1 /*
2  * Copyright (C) 2013 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "suggest/core/result/suggestions_output_utils.h"
18 
19 #include <algorithm>
20 #include <vector>
21 
22 #include "dictionary/utils/binary_dictionary_shortcut_iterator.h"
23 #include "suggest/core/dicnode/dic_node.h"
24 #include "suggest/core/dicnode/dic_node_utils.h"
25 #include "suggest/core/dictionary/error_type_utils.h"
26 #include "suggest/core/policy/scoring.h"
27 #include "suggest/core/result/suggestion_results.h"
28 #include "suggest/core/session/dic_traverse_session.h"
29 #include "suggest/core/suggest_options.h"
30 
31 namespace latinime {
32 
33 const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
34 
outputSuggestions(const Scoring * const scoringPolicy,DicTraverseSession * traverseSession,const float weightOfLangModelVsSpatialModel,SuggestionResults * const outSuggestionResults)35 /* static */ void SuggestionsOutputUtils::outputSuggestions(
36         const Scoring *const scoringPolicy, DicTraverseSession *traverseSession,
37         const float weightOfLangModelVsSpatialModel,
38         SuggestionResults *const outSuggestionResults) {
39 #if DEBUG_EVALUATE_MOST_PROBABLE_STRING
40     const int terminalSize = 0;
41 #else
42     const int terminalSize = traverseSession->getDicTraverseCache()->terminalSize();
43 #endif
44     std::vector<DicNode> terminals(terminalSize);
45     for (int index = terminalSize - 1; index >= 0; --index) {
46         traverseSession->getDicTraverseCache()->popTerminal(&terminals[index]);
47     }
48     // Compute a weight of language model when an invalid weight is passed.
49     // NOT_A_WEIGHT_OF_LANG_MODEL_VS_SPATIAL_MODEL (-1) is taken as an invalid value.
50     const float weightOfLangModelVsSpatialModelToOutputSuggestions =
51             (weightOfLangModelVsSpatialModel < 0.0f)
52             ? scoringPolicy->getAdjustedWeightOfLangModelVsSpatialModel(traverseSession,
53                     terminals.data(), terminalSize)
54             : weightOfLangModelVsSpatialModel;
55     outSuggestionResults->setWeightOfLangModelVsSpatialModel(
56             weightOfLangModelVsSpatialModelToOutputSuggestions);
57     // Force autocorrection for obvious long multi-word suggestions when the top suggestion is
58     // a long multiple words suggestion.
59     // TODO: Implement a smarter auto-commit method for handling multi-word suggestions.
60     const bool forceCommitMultiWords = scoringPolicy->autoCorrectsToMultiWordSuggestionIfTop()
61             && (traverseSession->getInputSize() >= MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT
62                     && !terminals.empty() && terminals.front().hasMultipleWords());
63     // TODO: have partial commit work even with multiple pointers.
64     const bool outputSecondWordFirstLetterInputIndex =
65             traverseSession->isOnlyOnePointerUsed(0 /* pointerId */);
66     const bool boostExactMatches = traverseSession->getDictionaryStructurePolicy()->
67             getHeaderStructurePolicy()->shouldBoostExactMatches();
68 
69     // Output suggestion results here
70     for (auto &terminalDicNode : terminals) {
71         outputSuggestionsOfDicNode(scoringPolicy, traverseSession, &terminalDicNode,
72                 weightOfLangModelVsSpatialModelToOutputSuggestions, boostExactMatches,
73                 forceCommitMultiWords, outputSecondWordFirstLetterInputIndex, outSuggestionResults);
74     }
75     scoringPolicy->getMostProbableString(traverseSession,
76             weightOfLangModelVsSpatialModelToOutputSuggestions, outSuggestionResults);
77 }
78 
shouldBlockWord(const SuggestOptions * const suggestOptions,const DicNode * const terminalDicNode,const WordAttributes wordAttributes,const bool isLastWord)79 /* static */ bool SuggestionsOutputUtils::shouldBlockWord(
80         const SuggestOptions *const suggestOptions, const DicNode *const terminalDicNode,
81         const WordAttributes wordAttributes, const bool isLastWord) {
82     const bool currentWordExactMatch =
83             ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes());
84     // When we have to block offensive words, non-exact matched offensive words should not be
85     // output.
86     const bool shouldBlockOffensiveWords = suggestOptions->blockOffensiveWords();
87 
88     const bool isBlockedOffensiveWord = shouldBlockOffensiveWords &&
89             wordAttributes.isPossiblyOffensive();
90 
91     // This function is called in two situations:
92     //
93     // 1) At the end of a search, in which case terminalDicNode will point to the last DicNode
94     //    of the search, and isLastWord will be true.
95     //                    "fuck"
96     //                        |
97     //                        \ terminalDicNode (isLastWord=true, currentWordExactMatch=true)
98     //    In this case, if the current word is an exact match, we will always let the word
99     //    through, even if the user is blocking offensive words (it's exactly what they typed!)
100     //
101     // 2) In the middle of the search, when we hit a terminal node, to decide whether or not
102     //    to start a new search at root, to try to match the rest of the input. In this case,
103     //    terminalDicNode will point to the terminal node we just hit, and isLastWord will be
104     //    false.
105     //                    "fuckvthis"
106     //                        |
107     //                        \ terminalDicNode (isLastWord=false, currentWordExactMatch=true)
108     //
109     // In this case, we should NOT allow the match through (correcting "fuckthis" to "fuck this"
110     // when offensive words are blocked would be a bad idea).
111     //
112     // In the case of a multi-word correction where the offensive word is typed last (eg.
113     // for the input "allfuck"), this function will be called with isLastWord==true, but
114     // currentWordExactMatch==false. So we are OK in this case as well.
115     //                    "allfuck"
116     //                           |
117     //                           \ terminalDicNode (isLastWord=true, currentWordExactMatch=false)
118     if (isLastWord && currentWordExactMatch) {
119         return false;
120     } else {
121         return isBlockedOffensiveWord;
122     }
123 }
124 
outputSuggestionsOfDicNode(const Scoring * const scoringPolicy,DicTraverseSession * traverseSession,const DicNode * const terminalDicNode,const float weightOfLangModelVsSpatialModel,const bool boostExactMatches,const bool forceCommitMultiWords,const bool outputSecondWordFirstLetterInputIndex,SuggestionResults * const outSuggestionResults)125 /* static */ void SuggestionsOutputUtils::outputSuggestionsOfDicNode(
126         const Scoring *const scoringPolicy, DicTraverseSession *traverseSession,
127         const DicNode *const terminalDicNode, const float weightOfLangModelVsSpatialModel,
128         const bool boostExactMatches, const bool forceCommitMultiWords,
129         const bool outputSecondWordFirstLetterInputIndex,
130         SuggestionResults *const outSuggestionResults) {
131     if (DEBUG_GEO_FULL) {
132         terminalDicNode->dump("OUT:");
133     }
134     const float doubleLetterCost =
135             scoringPolicy->getDoubleLetterDemotionDistanceCost(terminalDicNode);
136     const float compoundDistance =
137             terminalDicNode->getCompoundDistance(weightOfLangModelVsSpatialModel)
138                     + doubleLetterCost;
139     const WordAttributes wordAttributes = traverseSession->getDictionaryStructurePolicy()
140             ->getWordAttributesInContext(terminalDicNode->getPrevWordIds(),
141                     terminalDicNode->getWordId(), nullptr /* multiBigramMap */);
142     const bool isExactMatch =
143             ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes());
144     const bool isExactMatchWithIntentionalOmission =
145             ErrorTypeUtils::isExactMatchWithIntentionalOmission(
146                     terminalDicNode->getContainedErrorTypes());
147     // TODO: Decide whether the word should be auto-corrected or not here.
148     const bool isAppropriateForAutoCorrection = !ErrorTypeUtils::isMissingExplicitAccent(
149             terminalDicNode->getContainedErrorTypes());
150     const int outputTypeFlags =
151             (wordAttributes.isPossiblyOffensive() ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0)
152             | ((isExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0)
153             | (isExactMatchWithIntentionalOmission ?
154                     Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0)
155             | (isAppropriateForAutoCorrection ?
156                     Dictionary::KIND_FLAG_APPROPRIATE_FOR_AUTOCORRECTION : 0);
157     // Entries that are blacklisted or do not represent a word should not be output.
158     const bool isValidWord = !(wordAttributes.isBlacklisted() || wordAttributes.isNotAWord());
159 
160     const bool shouldBlockThisWord = shouldBlockWord(traverseSession->getSuggestOptions(),
161             terminalDicNode, wordAttributes, true /* isLastWord */);
162 
163     // Increase output score of top typing suggestion to ensure autocorrection.
164     // TODO: Better integration with java side autocorrection logic.
165     const int finalScore = scoringPolicy->calculateFinalScore(
166             compoundDistance, traverseSession->getInputSize(),
167             terminalDicNode->getContainedErrorTypes(),
168             (forceCommitMultiWords && terminalDicNode->hasMultipleWords()),
169             boostExactMatches, wordAttributes.getProbability() == 0);
170 
171     // Don't output invalid or blocked offensive words. However, we still need to submit their
172     // shortcuts if any.
173     if (isValidWord && !shouldBlockThisWord) {
174         int codePoints[MAX_WORD_LENGTH];
175         terminalDicNode->outputResult(codePoints);
176         const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ?
177                 terminalDicNode->getSecondWordFirstInputIndex(
178                         traverseSession->getProximityInfoState(0)) :
179                 NOT_AN_INDEX;
180         outSuggestionResults->addSuggestion(codePoints,
181                 terminalDicNode->getTotalNodeCodePointCount(),
182                 finalScore, Dictionary::KIND_CORRECTION | outputTypeFlags,
183                 indexToPartialCommit, computeFirstWordConfidence(terminalDicNode));
184     }
185 
186     // Output shortcuts.
187     // Shortcut is not supported for multiple words suggestions.
188     // TODO: Check shortcuts during traversal for multiple words suggestions.
189     if (!terminalDicNode->hasMultipleWords()) {
190         BinaryDictionaryShortcutIterator shortcutIt =
191                 traverseSession->getDictionaryStructurePolicy()->getShortcutIterator(
192                         terminalDicNode->getWordId());
193         const bool sameAsTyped = scoringPolicy->sameAsTyped(traverseSession, terminalDicNode);
194         outputShortcuts(&shortcutIt, finalScore, sameAsTyped, outSuggestionResults);
195     }
196 }
197 
computeFirstWordConfidence(const DicNode * const terminalDicNode)198 /* static */ int SuggestionsOutputUtils::computeFirstWordConfidence(
199         const DicNode *const terminalDicNode) {
200     // Get the number of spaces in the first suggestion
201     const int spaceCount = terminalDicNode->getTotalNodeSpaceCount();
202     // Get the number of characters in the first suggestion
203     const int length = terminalDicNode->getTotalNodeCodePointCount();
204     // Get the distance for the first word of the suggestion
205     const float distance = terminalDicNode->getNormalizedCompoundDistanceAfterFirstWord();
206 
207     // Arbitrarily, we give a score whose useful values range from 0 to 1,000,000.
208     // 1,000,000 will be the cutoff to auto-commit. It's fine if the number is under 0 or
209     // above 1,000,000 : under 0 just means it's very bad to commit, and above 1,000,000 means
210     // we are very confident.
211     // Expected space count is 1 ~ 5
212     static const int MIN_EXPECTED_SPACE_COUNT = 1;
213     static const int MAX_EXPECTED_SPACE_COUNT = 5;
214     // Expected length is about 4 ~ 30
215     static const int MIN_EXPECTED_LENGTH = 4;
216     static const int MAX_EXPECTED_LENGTH = 30;
217     // Expected distance is about 0.2 ~ 2.0, but consider 0.0 ~ 2.0
218     static const float MIN_EXPECTED_DISTANCE = 0.0;
219     static const float MAX_EXPECTED_DISTANCE = 2.0;
220     // This is not strict: it's where most stuff will be falling, but it's still fine if it's
221     // outside these values. We want to output a value that reflects all of these. Each factor
222     // contributes a bit.
223 
224     // We need at least a space.
225     if (spaceCount < 1) return NOT_A_FIRST_WORD_CONFIDENCE;
226 
227     // The smaller the edit distance, the higher the contribution. MIN_EXPECTED_DISTANCE means 0
228     // contribution, while MAX_EXPECTED_DISTANCE means full contribution according to the
229     // weight of the distance. Clamp to avoid overflows.
230     const float clampedDistance = distance < MIN_EXPECTED_DISTANCE ? MIN_EXPECTED_DISTANCE
231             : distance > MAX_EXPECTED_DISTANCE ? MAX_EXPECTED_DISTANCE : distance;
232     const int distanceContribution = DISTANCE_WEIGHT_FOR_AUTO_COMMIT
233             * (MAX_EXPECTED_DISTANCE - clampedDistance)
234             / (MAX_EXPECTED_DISTANCE - MIN_EXPECTED_DISTANCE);
235     // The larger the suggestion length, the larger the contribution. MIN_EXPECTED_LENGTH is no
236     // contribution, MAX_EXPECTED_LENGTH is full contribution according to the weight of the
237     // length. Length is guaranteed to be between 1 and 48, so we don't need to clamp.
238     const int lengthContribution = LENGTH_WEIGHT_FOR_AUTO_COMMIT
239             * (length - MIN_EXPECTED_LENGTH) / (MAX_EXPECTED_LENGTH - MIN_EXPECTED_LENGTH);
240     // The more spaces, the larger the contribution. MIN_EXPECTED_SPACE_COUNT space is no
241     // contribution, MAX_EXPECTED_SPACE_COUNT spaces is full contribution according to the
242     // weight of the space count.
243     const int spaceContribution = SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT
244             * (spaceCount - MIN_EXPECTED_SPACE_COUNT)
245             / (MAX_EXPECTED_SPACE_COUNT - MIN_EXPECTED_SPACE_COUNT);
246 
247     return distanceContribution + lengthContribution + spaceContribution;
248 }
249 
outputShortcuts(BinaryDictionaryShortcutIterator * const shortcutIt,const int finalScore,const bool sameAsTyped,SuggestionResults * const outSuggestionResults)250 /* static */ void SuggestionsOutputUtils::outputShortcuts(
251         BinaryDictionaryShortcutIterator *const shortcutIt, const int finalScore,
252         const bool sameAsTyped, SuggestionResults *const outSuggestionResults) {
253     int shortcutTarget[MAX_WORD_LENGTH];
254     while (shortcutIt->hasNextShortcutTarget()) {
255         bool isWhilelist;
256         int shortcutTargetStringLength;
257         shortcutIt->nextShortcutTarget(MAX_WORD_LENGTH, shortcutTarget,
258                 &shortcutTargetStringLength, &isWhilelist);
259         int shortcutScore;
260         int kind;
261         if (isWhilelist && sameAsTyped) {
262             shortcutScore = S_INT_MAX;
263             kind = Dictionary::KIND_WHITELIST;
264         } else {
265             // shortcut entry's score == its base entry's score - 1
266             shortcutScore = finalScore;
267             // Protection against int underflow
268             shortcutScore = std::max(S_INT_MIN + 1, shortcutScore) - 1;
269             kind = Dictionary::KIND_SHORTCUT;
270         }
271         outSuggestionResults->addSuggestion(shortcutTarget, shortcutTargetStringLength,
272                 std::max(S_INT_MIN + 1, shortcutScore) - 1, kind, NOT_AN_INDEX,
273                 NOT_A_FIRST_WORD_CONFIDENCE);
274     }
275 }
276 } // namespace latinime
277