1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License
15  */
16 package com.android.providers.contacts.aggregation.util;
17 
18 import android.util.ArrayMap;
19 import android.util.Log;
20 import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType;
21 import com.android.providers.contacts.util.Hex;
22 
23 import java.util.ArrayList;
24 import java.util.Collections;
25 import java.util.List;
26 
27 /**
28  * Logic for matching raw contacts' data.
29  */
30 public class RawContactMatcher {
31     private static final String TAG = "ContactMatcher";
32 
33     // Suggest to aggregate contacts if their match score is equal or greater than this threshold
34     public static final int SCORE_THRESHOLD_SUGGEST = 50;
35 
36     public static final int SCORE_THRESHOLD_NO_NAME = 50;
37 
38     // Automatically aggregate contacts if their match score is equal or greater than this threshold
39     public static final int SCORE_THRESHOLD_PRIMARY = 70;
40 
41     // Automatically aggregate contacts if the match score is equal or greater than this threshold
42     // and there is a secondary match (phone number, email etc).
43     public static final int SCORE_THRESHOLD_SECONDARY = 50;
44 
45     // Score for matching phone numbers
46     private static final int PHONE_MATCH_SCORE = 71;
47 
48     // Score for matching email addresses
49     private static final int EMAIL_MATCH_SCORE = 71;
50 
51     // Score for matching identity
52     private static final int IDENTITY_MATCH_SCORE = 71;
53 
54     // Score for matching nickname
55     private static final int NICKNAME_MATCH_SCORE = 71;
56 
57     // Maximum number of characters in a name to be considered by the matching algorithm.
58     private static final int MAX_MATCHED_NAME_LENGTH = 30;
59 
60     // Scores a multiplied by this number to allow room for "fractional" scores
61     private static final int SCORE_SCALE = 1000;
62 
63     public static final int MATCHING_ALGORITHM_EXACT = 0;
64     public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1;
65     public static final int MATCHING_ALGORITHM_APPROXIMATE = 2;
66 
67     // Minimum edit distance between two names to be considered an approximate match
68     public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f;
69 
70     // Minimum edit distance between two email ids to be considered an approximate match
71     public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f;
72 
73     // Returned value when we found multiple matches and that was not allowed
74     public static final long MULTIPLE_MATCHES = -2;
75 
76     /**
77      * Name matching scores: a matrix by name type vs. candidate lookup type.
78      * For example, if the name type is "full name" while we are looking for a
79      * "full name", the score may be 99. If we are looking for a "nickname" but
80      * find "first name", the score may be 50 (see specific scores defined
81      * below.)
82      * <p>
83      * For approximate matching, we have a range of scores, let's say 40-70.  Depending one how
84      * similar the two strings are, the score will be somewhere between 40 and 70, with the exact
85      * match producing the score of 70.  The score may also be 0 if the similarity (distance)
86      * between the strings is below the threshold.
87      * <p>
88      * We use a string matching algorithm, which is particularly suited for
89      * name matching. See {@link NameDistance}.
90      */
91     private static int[] sMinScore =
92             new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
93     private static int[] sMaxScore =
94             new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
95 
96     /*
97      * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE},
98      * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are
99      * not!  They are useful in three-way aggregation cases when we have, for example, both
100      * John Smith and Smith John.  A third contact with the name John Smith should be aggregated
101      * with the former rather than the latter.  This is why "reverse" matches have slightly lower
102      * scores than direct matches.
103      */
104     static {
setScoreRange(NameLookupType.NAME_EXACT, NameLookupType.NAME_EXACT, 99, 99)105         setScoreRange(NameLookupType.NAME_EXACT,
106                 NameLookupType.NAME_EXACT, 99, 99);
setScoreRange(NameLookupType.NAME_VARIANT, NameLookupType.NAME_VARIANT, 90, 90)107         setScoreRange(NameLookupType.NAME_VARIANT,
108                 NameLookupType.NAME_VARIANT, 90, 90);
setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.NAME_COLLATION_KEY, 50, 80)109         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
110                 NameLookupType.NAME_COLLATION_KEY, 50, 80);
111 
setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.EMAIL_BASED_NICKNAME, 30, 60)112         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
113                 NameLookupType.EMAIL_BASED_NICKNAME, 30, 60);
setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.NICKNAME, 50, 60)114         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
115                 NameLookupType.NICKNAME, 50, 60);
116 
setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.EMAIL_BASED_NICKNAME, 50, 60)117         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
118                 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.NAME_COLLATION_KEY, 50, 60)119         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
120                 NameLookupType.NAME_COLLATION_KEY, 50, 60);
setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.NICKNAME, 50, 60)121         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
122                 NameLookupType.NICKNAME, 50, 60);
123 
setScoreRange(NameLookupType.NICKNAME, NameLookupType.NICKNAME, 50, 60)124         setScoreRange(NameLookupType.NICKNAME,
125                 NameLookupType.NICKNAME, 50, 60);
setScoreRange(NameLookupType.NICKNAME, NameLookupType.NAME_COLLATION_KEY, 50, 60)126         setScoreRange(NameLookupType.NICKNAME,
127                 NameLookupType.NAME_COLLATION_KEY, 50, 60);
setScoreRange(NameLookupType.NICKNAME, NameLookupType.EMAIL_BASED_NICKNAME, 50, 60)128         setScoreRange(NameLookupType.NICKNAME,
129                 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
130     }
131 
132     /**
133      * Populates the cells of the score matrix and score span matrix
134      * corresponding to the {@code candidateNameType} and {@code nameType}.
135      */
setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo)136     private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom,
137             int scoreTo) {
138         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
139         sMinScore[index] = scoreFrom;
140         sMaxScore[index] = scoreTo;
141     }
142 
143     /**
144      * Returns the lower range for the match score for the given {@code candidateNameType} and
145      * {@code nameType}.
146      */
getMinScore(int candidateNameType, int nameType)147     private static int getMinScore(int candidateNameType, int nameType) {
148         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
149         return sMinScore[index];
150     }
151 
152     /**
153      * Returns the upper range for the match score for the given {@code candidateNameType} and
154      * {@code nameType}.
155      */
getMaxScore(int candidateNameType, int nameType)156     private static int getMaxScore(int candidateNameType, int nameType) {
157         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
158         return sMaxScore[index];
159     }
160 
161     private final ArrayMap<Long, MatchScore> mScores = new ArrayMap<>();
162     private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>();
163     private int mScoreCount = 0;
164 
165     private final NameDistance mNameDistanceConservative = new NameDistance();
166     private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH);
167 
getMatchingScore(long rawContactId, long contactId, long accountId)168     private MatchScore getMatchingScore(long rawContactId, long contactId, long accountId) {
169         MatchScore matchingScore = mScores.get(rawContactId);
170         if (matchingScore == null) {
171             if (mScoreList.size() > mScoreCount) {
172                 matchingScore = mScoreList.get(mScoreCount);
173                 matchingScore.reset(rawContactId, contactId, accountId);
174             } else {
175                 matchingScore = new MatchScore(rawContactId, contactId, accountId);
176                 mScoreList.add(matchingScore);
177             }
178             mScoreCount++;
179             mScores.put(rawContactId, matchingScore);
180         }
181         return matchingScore;
182     }
183 
184     /**
185      * Checks if there is a match and updates the overall score for the
186      * specified contact for a discovered match. The new score is determined
187      * by the prior score, by the type of name we were looking for, the type
188      * of name we found and, if the match is approximate, the distance between the candidate and
189      * actual name.
190      */
matchName(long rawContactId, long contactId, long accountId, int candidateNameType, String candidateName, int nameType, String name, int algorithm)191     public void matchName(long rawContactId, long contactId, long accountId, int
192             candidateNameType, String candidateName, int nameType, String name, int algorithm) {
193         int maxScore = getMaxScore(candidateNameType, nameType);
194         if (maxScore == 0) {
195             return;
196         }
197 
198         if (candidateName.equals(name)) {
199             updatePrimaryScore(rawContactId, contactId, accountId, maxScore);
200             return;
201         }
202 
203         if (algorithm == MATCHING_ALGORITHM_EXACT) {
204             return;
205         }
206 
207         int minScore = getMinScore(candidateNameType, nameType);
208         if (minScore == maxScore) {
209             return;
210         }
211 
212         final byte[] decodedCandidateName;
213         final byte[] decodedName;
214         try {
215             decodedCandidateName = Hex.decodeHex(candidateName);
216             decodedName = Hex.decodeHex(name);
217         } catch (RuntimeException e) {
218             // How could this happen??  See bug 6827136
219             Log.e(TAG, "Failed to decode normalized name.  Skipping.", e);
220             return;
221         }
222 
223         NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ?
224                 mNameDistanceConservative : mNameDistanceApproximate;
225 
226         int score;
227         float distance = nameDistance.getDistance(decodedCandidateName, decodedName);
228         boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME
229                 || nameType == NameLookupType.EMAIL_BASED_NICKNAME;
230         float threshold = emailBased
231                 ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL
232                 : APPROXIMATE_MATCH_THRESHOLD;
233         if (distance > threshold) {
234             score = (int)(minScore +  (maxScore - minScore) * (1.0f - distance));
235         } else {
236             score = 0;
237         }
238 
239         updatePrimaryScore(rawContactId, contactId, accountId, score);
240     }
241 
matchIdentity(long rawContactId, long contactId, long accountId)242     public void matchIdentity(long rawContactId, long contactId, long accountId) {
243         updateSecondaryScore(rawContactId, contactId, accountId, IDENTITY_MATCH_SCORE);
244     }
245 
updateScoreWithPhoneNumberMatch(long rawContactId, long contactId, long accountId)246     public void updateScoreWithPhoneNumberMatch(long rawContactId, long contactId, long accountId) {
247         updateSecondaryScore(rawContactId, contactId, accountId, PHONE_MATCH_SCORE);
248     }
249 
updateScoreWithEmailMatch(long rawContactId, long contactId, long accountId)250     public void updateScoreWithEmailMatch(long rawContactId, long contactId, long accountId) {
251         updateSecondaryScore(rawContactId, contactId, accountId, EMAIL_MATCH_SCORE);
252     }
253 
updateScoreWithNicknameMatch(long rawContactId, long contactId, long accountId)254     public void updateScoreWithNicknameMatch(long rawContactId, long contactId, long accountId) {
255         updateSecondaryScore(rawContactId, contactId, accountId, NICKNAME_MATCH_SCORE);
256     }
257 
updatePrimaryScore(long rawContactId, long contactId, long accountId, int score)258     private void updatePrimaryScore(long rawContactId, long contactId, long accountId, int score) {
259         getMatchingScore(rawContactId, contactId, accountId).updatePrimaryScore(score);
260     }
261 
updateSecondaryScore(long rawContactId, long contactId, long accountId, int score)262     private void updateSecondaryScore(long rawContactId, long contactId, long accountId,
263             int score) {
264         getMatchingScore(rawContactId, contactId, accountId).updateSecondaryScore(score);
265     }
266 
keepIn(long rawContactId, long contactId, long accountId)267     public void keepIn(long rawContactId, long contactId, long accountId) {
268         getMatchingScore(rawContactId, contactId, accountId).keepIn();
269     }
270 
keepOut(long rawContactId, long contactId, long accountId)271     public void keepOut(long rawContactId, long contactId, long accountId) {
272         getMatchingScore(rawContactId, contactId, accountId).keepOut();
273     }
274 
clear()275     public void clear() {
276         mScores.clear();
277         mScoreCount = 0;
278     }
279     /**
280      * Returns a list of IDs for raw contacts that are only matched on secondary data elements
281      * (phone number, email address, nickname, identity). We need to check if they are missing
282      * structured name or not to decide if they should be aggregated.
283      * <p>
284      * May return null.
285      */
prepareSecondaryMatchCandidates()286     public List<Long> prepareSecondaryMatchCandidates() {
287         ArrayList<Long> rawContactIds = null;
288 
289         for (int i = 0; i < mScoreCount; i++) {
290             MatchScore score = mScoreList.get(i);
291             if (score.isKeepOut() ||  score.getPrimaryScore() > SCORE_THRESHOLD_PRIMARY){
292                 continue;
293             }
294 
295             if (score.getSecondaryScore() >= SCORE_THRESHOLD_PRIMARY) {
296                 if (rawContactIds == null) {
297                     rawContactIds = new ArrayList<>();
298                 }
299                 rawContactIds.add(score.getRawContactId());
300             }
301             score.setPrimaryScore(0);
302         }
303         return rawContactIds;
304     }
305 
306     /**
307      * Returns the list of raw contact Ids with the match score over threshold.
308      */
pickBestMatches()309     public List<MatchScore> pickBestMatches() {
310         final List<MatchScore> matches = new ArrayList<>();
311         for (int i = 0; i < mScoreCount; i++) {
312             MatchScore score = mScoreList.get(i);
313             if (score.isKeepOut()) {
314                 continue;
315             }
316 
317             if (score.isKeepIn()) {
318                 matches.add(score);
319                 continue;
320             }
321 
322             if (score.getPrimaryScore() >= SCORE_THRESHOLD_PRIMARY ||
323                     (score.getPrimaryScore() == SCORE_THRESHOLD_NO_NAME &&
324                             score.getSecondaryScore() > SCORE_THRESHOLD_SECONDARY)) {
325                 matches.add(score);
326             }
327         }
328         return matches;
329     }
330 
331     /**
332      * Returns matches in the order of descending score.
333      */
pickBestMatches(int threshold)334     public List<MatchScore> pickBestMatches(int threshold) {
335         int scaledThreshold = threshold * SCORE_SCALE;
336         List<MatchScore> matches = mScoreList.subList(0, mScoreCount);
337         Collections.sort(matches);
338         int count = 0;
339         for (int i = 0; i < mScoreCount; i++) {
340             MatchScore matchScore = matches.get(i);
341             if (matchScore.getScore() >= scaledThreshold) {
342                 count++;
343             } else {
344                 break;
345             }
346         }
347 
348         return matches.subList(0, count);
349     }
350 
351     @Override
toString()352     public String toString() {
353         return mScoreList.subList(0, mScoreCount).toString();
354     }
355 
matchNoName(Long rawContactId, Long contactId, Long accountId)356     public void matchNoName(Long rawContactId, Long contactId, Long accountId) {
357         updatePrimaryScore(rawContactId, contactId, accountId, SCORE_THRESHOLD_NO_NAME);
358     }
359 }
360