1 /*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "suggest/core/result/suggestions_output_utils.h"
18
19 #include <algorithm>
20 #include <vector>
21
22 #include "dictionary/utils/binary_dictionary_shortcut_iterator.h"
23 #include "suggest/core/dicnode/dic_node.h"
24 #include "suggest/core/dicnode/dic_node_utils.h"
25 #include "suggest/core/dictionary/error_type_utils.h"
26 #include "suggest/core/policy/scoring.h"
27 #include "suggest/core/result/suggestion_results.h"
28 #include "suggest/core/session/dic_traverse_session.h"
29 #include "suggest/core/suggest_options.h"
30
31 namespace latinime {
32
33 const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
34
outputSuggestions(const Scoring * const scoringPolicy,DicTraverseSession * traverseSession,const float weightOfLangModelVsSpatialModel,SuggestionResults * const outSuggestionResults)35 /* static */ void SuggestionsOutputUtils::outputSuggestions(
36 const Scoring *const scoringPolicy, DicTraverseSession *traverseSession,
37 const float weightOfLangModelVsSpatialModel,
38 SuggestionResults *const outSuggestionResults) {
39 #if DEBUG_EVALUATE_MOST_PROBABLE_STRING
40 const int terminalSize = 0;
41 #else
42 const int terminalSize = traverseSession->getDicTraverseCache()->terminalSize();
43 #endif
44 std::vector<DicNode> terminals(terminalSize);
45 for (int index = terminalSize - 1; index >= 0; --index) {
46 traverseSession->getDicTraverseCache()->popTerminal(&terminals[index]);
47 }
48 // Compute a weight of language model when an invalid weight is passed.
49 // NOT_A_WEIGHT_OF_LANG_MODEL_VS_SPATIAL_MODEL (-1) is taken as an invalid value.
50 const float weightOfLangModelVsSpatialModelToOutputSuggestions =
51 (weightOfLangModelVsSpatialModel < 0.0f)
52 ? scoringPolicy->getAdjustedWeightOfLangModelVsSpatialModel(traverseSession,
53 terminals.data(), terminalSize)
54 : weightOfLangModelVsSpatialModel;
55 outSuggestionResults->setWeightOfLangModelVsSpatialModel(
56 weightOfLangModelVsSpatialModelToOutputSuggestions);
57 // Force autocorrection for obvious long multi-word suggestions when the top suggestion is
58 // a long multiple words suggestion.
59 // TODO: Implement a smarter auto-commit method for handling multi-word suggestions.
60 const bool forceCommitMultiWords = scoringPolicy->autoCorrectsToMultiWordSuggestionIfTop()
61 && (traverseSession->getInputSize() >= MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT
62 && !terminals.empty() && terminals.front().hasMultipleWords());
63 // TODO: have partial commit work even with multiple pointers.
64 const bool outputSecondWordFirstLetterInputIndex =
65 traverseSession->isOnlyOnePointerUsed(0 /* pointerId */);
66 const bool boostExactMatches = traverseSession->getDictionaryStructurePolicy()->
67 getHeaderStructurePolicy()->shouldBoostExactMatches();
68
69 // Output suggestion results here
70 for (auto &terminalDicNode : terminals) {
71 outputSuggestionsOfDicNode(scoringPolicy, traverseSession, &terminalDicNode,
72 weightOfLangModelVsSpatialModelToOutputSuggestions, boostExactMatches,
73 forceCommitMultiWords, outputSecondWordFirstLetterInputIndex, outSuggestionResults);
74 }
75 scoringPolicy->getMostProbableString(traverseSession,
76 weightOfLangModelVsSpatialModelToOutputSuggestions, outSuggestionResults);
77 }
78
shouldBlockWord(const SuggestOptions * const suggestOptions,const DicNode * const terminalDicNode,const WordAttributes wordAttributes,const bool isLastWord)79 /* static */ bool SuggestionsOutputUtils::shouldBlockWord(
80 const SuggestOptions *const suggestOptions, const DicNode *const terminalDicNode,
81 const WordAttributes wordAttributes, const bool isLastWord) {
82 const bool currentWordExactMatch =
83 ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes());
84 // When we have to block offensive words, non-exact matched offensive words should not be
85 // output.
86 const bool shouldBlockOffensiveWords = suggestOptions->blockOffensiveWords();
87
88 const bool isBlockedOffensiveWord = shouldBlockOffensiveWords &&
89 wordAttributes.isPossiblyOffensive();
90
91 // This function is called in two situations:
92 //
93 // 1) At the end of a search, in which case terminalDicNode will point to the last DicNode
94 // of the search, and isLastWord will be true.
95 // "fuck"
96 // |
97 // \ terminalDicNode (isLastWord=true, currentWordExactMatch=true)
98 // In this case, if the current word is an exact match, we will always let the word
99 // through, even if the user is blocking offensive words (it's exactly what they typed!)
100 //
101 // 2) In the middle of the search, when we hit a terminal node, to decide whether or not
102 // to start a new search at root, to try to match the rest of the input. In this case,
103 // terminalDicNode will point to the terminal node we just hit, and isLastWord will be
104 // false.
105 // "fuckvthis"
106 // |
107 // \ terminalDicNode (isLastWord=false, currentWordExactMatch=true)
108 //
109 // In this case, we should NOT allow the match through (correcting "fuckthis" to "fuck this"
110 // when offensive words are blocked would be a bad idea).
111 //
112 // In the case of a multi-word correction where the offensive word is typed last (eg.
113 // for the input "allfuck"), this function will be called with isLastWord==true, but
114 // currentWordExactMatch==false. So we are OK in this case as well.
115 // "allfuck"
116 // |
117 // \ terminalDicNode (isLastWord=true, currentWordExactMatch=false)
118 if (isLastWord && currentWordExactMatch) {
119 return false;
120 } else {
121 return isBlockedOffensiveWord;
122 }
123 }
124
outputSuggestionsOfDicNode(const Scoring * const scoringPolicy,DicTraverseSession * traverseSession,const DicNode * const terminalDicNode,const float weightOfLangModelVsSpatialModel,const bool boostExactMatches,const bool forceCommitMultiWords,const bool outputSecondWordFirstLetterInputIndex,SuggestionResults * const outSuggestionResults)125 /* static */ void SuggestionsOutputUtils::outputSuggestionsOfDicNode(
126 const Scoring *const scoringPolicy, DicTraverseSession *traverseSession,
127 const DicNode *const terminalDicNode, const float weightOfLangModelVsSpatialModel,
128 const bool boostExactMatches, const bool forceCommitMultiWords,
129 const bool outputSecondWordFirstLetterInputIndex,
130 SuggestionResults *const outSuggestionResults) {
131 if (DEBUG_GEO_FULL) {
132 terminalDicNode->dump("OUT:");
133 }
134 const float doubleLetterCost =
135 scoringPolicy->getDoubleLetterDemotionDistanceCost(terminalDicNode);
136 const float compoundDistance =
137 terminalDicNode->getCompoundDistance(weightOfLangModelVsSpatialModel)
138 + doubleLetterCost;
139 const WordAttributes wordAttributes = traverseSession->getDictionaryStructurePolicy()
140 ->getWordAttributesInContext(terminalDicNode->getPrevWordIds(),
141 terminalDicNode->getWordId(), nullptr /* multiBigramMap */);
142 const bool isExactMatch =
143 ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes());
144 const bool isExactMatchWithIntentionalOmission =
145 ErrorTypeUtils::isExactMatchWithIntentionalOmission(
146 terminalDicNode->getContainedErrorTypes());
147 // TODO: Decide whether the word should be auto-corrected or not here.
148 const bool isAppropriateForAutoCorrection = !ErrorTypeUtils::isMissingExplicitAccent(
149 terminalDicNode->getContainedErrorTypes());
150 const int outputTypeFlags =
151 (wordAttributes.isPossiblyOffensive() ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0)
152 | ((isExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0)
153 | (isExactMatchWithIntentionalOmission ?
154 Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0)
155 | (isAppropriateForAutoCorrection ?
156 Dictionary::KIND_FLAG_APPROPRIATE_FOR_AUTOCORRECTION : 0);
157 // Entries that are blacklisted or do not represent a word should not be output.
158 const bool isValidWord = !(wordAttributes.isBlacklisted() || wordAttributes.isNotAWord());
159
160 const bool shouldBlockThisWord = shouldBlockWord(traverseSession->getSuggestOptions(),
161 terminalDicNode, wordAttributes, true /* isLastWord */);
162
163 // Increase output score of top typing suggestion to ensure autocorrection.
164 // TODO: Better integration with java side autocorrection logic.
165 const int finalScore = scoringPolicy->calculateFinalScore(
166 compoundDistance, traverseSession->getInputSize(),
167 terminalDicNode->getContainedErrorTypes(),
168 (forceCommitMultiWords && terminalDicNode->hasMultipleWords()),
169 boostExactMatches, wordAttributes.getProbability() == 0);
170
171 // Don't output invalid or blocked offensive words. However, we still need to submit their
172 // shortcuts if any.
173 if (isValidWord && !shouldBlockThisWord) {
174 int codePoints[MAX_WORD_LENGTH];
175 terminalDicNode->outputResult(codePoints);
176 const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ?
177 terminalDicNode->getSecondWordFirstInputIndex(
178 traverseSession->getProximityInfoState(0)) :
179 NOT_AN_INDEX;
180 outSuggestionResults->addSuggestion(codePoints,
181 terminalDicNode->getTotalNodeCodePointCount(),
182 finalScore, Dictionary::KIND_CORRECTION | outputTypeFlags,
183 indexToPartialCommit, computeFirstWordConfidence(terminalDicNode));
184 }
185
186 // Output shortcuts.
187 // Shortcut is not supported for multiple words suggestions.
188 // TODO: Check shortcuts during traversal for multiple words suggestions.
189 if (!terminalDicNode->hasMultipleWords()) {
190 BinaryDictionaryShortcutIterator shortcutIt =
191 traverseSession->getDictionaryStructurePolicy()->getShortcutIterator(
192 terminalDicNode->getWordId());
193 const bool sameAsTyped = scoringPolicy->sameAsTyped(traverseSession, terminalDicNode);
194 outputShortcuts(&shortcutIt, finalScore, sameAsTyped, outSuggestionResults);
195 }
196 }
197
computeFirstWordConfidence(const DicNode * const terminalDicNode)198 /* static */ int SuggestionsOutputUtils::computeFirstWordConfidence(
199 const DicNode *const terminalDicNode) {
200 // Get the number of spaces in the first suggestion
201 const int spaceCount = terminalDicNode->getTotalNodeSpaceCount();
202 // Get the number of characters in the first suggestion
203 const int length = terminalDicNode->getTotalNodeCodePointCount();
204 // Get the distance for the first word of the suggestion
205 const float distance = terminalDicNode->getNormalizedCompoundDistanceAfterFirstWord();
206
207 // Arbitrarily, we give a score whose useful values range from 0 to 1,000,000.
208 // 1,000,000 will be the cutoff to auto-commit. It's fine if the number is under 0 or
209 // above 1,000,000 : under 0 just means it's very bad to commit, and above 1,000,000 means
210 // we are very confident.
211 // Expected space count is 1 ~ 5
212 static const int MIN_EXPECTED_SPACE_COUNT = 1;
213 static const int MAX_EXPECTED_SPACE_COUNT = 5;
214 // Expected length is about 4 ~ 30
215 static const int MIN_EXPECTED_LENGTH = 4;
216 static const int MAX_EXPECTED_LENGTH = 30;
217 // Expected distance is about 0.2 ~ 2.0, but consider 0.0 ~ 2.0
218 static const float MIN_EXPECTED_DISTANCE = 0.0;
219 static const float MAX_EXPECTED_DISTANCE = 2.0;
220 // This is not strict: it's where most stuff will be falling, but it's still fine if it's
221 // outside these values. We want to output a value that reflects all of these. Each factor
222 // contributes a bit.
223
224 // We need at least a space.
225 if (spaceCount < 1) return NOT_A_FIRST_WORD_CONFIDENCE;
226
227 // The smaller the edit distance, the higher the contribution. MIN_EXPECTED_DISTANCE means 0
228 // contribution, while MAX_EXPECTED_DISTANCE means full contribution according to the
229 // weight of the distance. Clamp to avoid overflows.
230 const float clampedDistance = distance < MIN_EXPECTED_DISTANCE ? MIN_EXPECTED_DISTANCE
231 : distance > MAX_EXPECTED_DISTANCE ? MAX_EXPECTED_DISTANCE : distance;
232 const int distanceContribution = DISTANCE_WEIGHT_FOR_AUTO_COMMIT
233 * (MAX_EXPECTED_DISTANCE - clampedDistance)
234 / (MAX_EXPECTED_DISTANCE - MIN_EXPECTED_DISTANCE);
235 // The larger the suggestion length, the larger the contribution. MIN_EXPECTED_LENGTH is no
236 // contribution, MAX_EXPECTED_LENGTH is full contribution according to the weight of the
237 // length. Length is guaranteed to be between 1 and 48, so we don't need to clamp.
238 const int lengthContribution = LENGTH_WEIGHT_FOR_AUTO_COMMIT
239 * (length - MIN_EXPECTED_LENGTH) / (MAX_EXPECTED_LENGTH - MIN_EXPECTED_LENGTH);
240 // The more spaces, the larger the contribution. MIN_EXPECTED_SPACE_COUNT space is no
241 // contribution, MAX_EXPECTED_SPACE_COUNT spaces is full contribution according to the
242 // weight of the space count.
243 const int spaceContribution = SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT
244 * (spaceCount - MIN_EXPECTED_SPACE_COUNT)
245 / (MAX_EXPECTED_SPACE_COUNT - MIN_EXPECTED_SPACE_COUNT);
246
247 return distanceContribution + lengthContribution + spaceContribution;
248 }
249
outputShortcuts(BinaryDictionaryShortcutIterator * const shortcutIt,const int finalScore,const bool sameAsTyped,SuggestionResults * const outSuggestionResults)250 /* static */ void SuggestionsOutputUtils::outputShortcuts(
251 BinaryDictionaryShortcutIterator *const shortcutIt, const int finalScore,
252 const bool sameAsTyped, SuggestionResults *const outSuggestionResults) {
253 int shortcutTarget[MAX_WORD_LENGTH];
254 while (shortcutIt->hasNextShortcutTarget()) {
255 bool isWhilelist;
256 int shortcutTargetStringLength;
257 shortcutIt->nextShortcutTarget(MAX_WORD_LENGTH, shortcutTarget,
258 &shortcutTargetStringLength, &isWhilelist);
259 int shortcutScore;
260 int kind;
261 if (isWhilelist && sameAsTyped) {
262 shortcutScore = S_INT_MAX;
263 kind = Dictionary::KIND_WHITELIST;
264 } else {
265 // shortcut entry's score == its base entry's score - 1
266 shortcutScore = finalScore;
267 // Protection against int underflow
268 shortcutScore = std::max(S_INT_MIN + 1, shortcutScore) - 1;
269 kind = Dictionary::KIND_SHORTCUT;
270 }
271 outSuggestionResults->addSuggestion(shortcutTarget, shortcutTargetStringLength,
272 std::max(S_INT_MIN + 1, shortcutScore) - 1, kind, NOT_AN_INDEX,
273 NOT_A_FIRST_WORD_CONFIDENCE);
274 }
275 }
276 } // namespace latinime
277