1 /* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License 15 */ 16 package com.android.providers.contacts.aggregation.util; 17 18 import android.util.ArrayMap; 19 import android.util.Log; 20 import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType; 21 import com.android.providers.contacts.util.Hex; 22 23 import java.util.ArrayList; 24 import java.util.Collections; 25 import java.util.List; 26 27 /** 28 * Logic for matching raw contacts' data. 29 */ 30 public class RawContactMatcher { 31 private static final String TAG = "ContactMatcher"; 32 33 // Suggest to aggregate contacts if their match score is equal or greater than this threshold 34 public static final int SCORE_THRESHOLD_SUGGEST = 50; 35 36 public static final int SCORE_THRESHOLD_NO_NAME = 50; 37 38 // Automatically aggregate contacts if their match score is equal or greater than this threshold 39 public static final int SCORE_THRESHOLD_PRIMARY = 70; 40 41 // Automatically aggregate contacts if the match score is equal or greater than this threshold 42 // and there is a secondary match (phone number, email etc). 43 public static final int SCORE_THRESHOLD_SECONDARY = 50; 44 45 // Score for matching phone numbers 46 private static final int PHONE_MATCH_SCORE = 71; 47 48 // Score for matching email addresses 49 private static final int EMAIL_MATCH_SCORE = 71; 50 51 // Score for matching identity 52 private static final int IDENTITY_MATCH_SCORE = 71; 53 54 // Score for matching nickname 55 private static final int NICKNAME_MATCH_SCORE = 71; 56 57 // Maximum number of characters in a name to be considered by the matching algorithm. 58 private static final int MAX_MATCHED_NAME_LENGTH = 30; 59 60 // Scores a multiplied by this number to allow room for "fractional" scores 61 private static final int SCORE_SCALE = 1000; 62 63 public static final int MATCHING_ALGORITHM_EXACT = 0; 64 public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1; 65 public static final int MATCHING_ALGORITHM_APPROXIMATE = 2; 66 67 // Minimum edit distance between two names to be considered an approximate match 68 public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f; 69 70 // Minimum edit distance between two email ids to be considered an approximate match 71 public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f; 72 73 // Returned value when we found multiple matches and that was not allowed 74 public static final long MULTIPLE_MATCHES = -2; 75 76 /** 77 * Name matching scores: a matrix by name type vs. candidate lookup type. 78 * For example, if the name type is "full name" while we are looking for a 79 * "full name", the score may be 99. If we are looking for a "nickname" but 80 * find "first name", the score may be 50 (see specific scores defined 81 * below.) 82 * <p> 83 * For approximate matching, we have a range of scores, let's say 40-70. Depending one how 84 * similar the two strings are, the score will be somewhere between 40 and 70, with the exact 85 * match producing the score of 70. The score may also be 0 if the similarity (distance) 86 * between the strings is below the threshold. 87 * <p> 88 * We use a string matching algorithm, which is particularly suited for 89 * name matching. See {@link NameDistance}. 90 */ 91 private static int[] sMinScore = 92 new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT]; 93 private static int[] sMaxScore = 94 new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT]; 95 96 /* 97 * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE}, 98 * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are 99 * not! They are useful in three-way aggregation cases when we have, for example, both 100 * John Smith and Smith John. A third contact with the name John Smith should be aggregated 101 * with the former rather than the latter. This is why "reverse" matches have slightly lower 102 * scores than direct matches. 103 */ 104 static { setScoreRange(NameLookupType.NAME_EXACT, NameLookupType.NAME_EXACT, 99, 99)105 setScoreRange(NameLookupType.NAME_EXACT, 106 NameLookupType.NAME_EXACT, 99, 99); setScoreRange(NameLookupType.NAME_VARIANT, NameLookupType.NAME_VARIANT, 90, 90)107 setScoreRange(NameLookupType.NAME_VARIANT, 108 NameLookupType.NAME_VARIANT, 90, 90); setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.NAME_COLLATION_KEY, 50, 80)109 setScoreRange(NameLookupType.NAME_COLLATION_KEY, 110 NameLookupType.NAME_COLLATION_KEY, 50, 80); 111 setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.EMAIL_BASED_NICKNAME, 30, 60)112 setScoreRange(NameLookupType.NAME_COLLATION_KEY, 113 NameLookupType.EMAIL_BASED_NICKNAME, 30, 60); setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.NICKNAME, 50, 60)114 setScoreRange(NameLookupType.NAME_COLLATION_KEY, 115 NameLookupType.NICKNAME, 50, 60); 116 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.EMAIL_BASED_NICKNAME, 50, 60)117 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, 118 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60); setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.NAME_COLLATION_KEY, 50, 60)119 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, 120 NameLookupType.NAME_COLLATION_KEY, 50, 60); setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.NICKNAME, 50, 60)121 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, 122 NameLookupType.NICKNAME, 50, 60); 123 setScoreRange(NameLookupType.NICKNAME, NameLookupType.NICKNAME, 50, 60)124 setScoreRange(NameLookupType.NICKNAME, 125 NameLookupType.NICKNAME, 50, 60); setScoreRange(NameLookupType.NICKNAME, NameLookupType.NAME_COLLATION_KEY, 50, 60)126 setScoreRange(NameLookupType.NICKNAME, 127 NameLookupType.NAME_COLLATION_KEY, 50, 60); setScoreRange(NameLookupType.NICKNAME, NameLookupType.EMAIL_BASED_NICKNAME, 50, 60)128 setScoreRange(NameLookupType.NICKNAME, 129 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60); 130 } 131 132 /** 133 * Populates the cells of the score matrix and score span matrix 134 * corresponding to the {@code candidateNameType} and {@code nameType}. 135 */ setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo)136 private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom, 137 int scoreTo) { 138 int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; 139 sMinScore[index] = scoreFrom; 140 sMaxScore[index] = scoreTo; 141 } 142 143 /** 144 * Returns the lower range for the match score for the given {@code candidateNameType} and 145 * {@code nameType}. 146 */ getMinScore(int candidateNameType, int nameType)147 private static int getMinScore(int candidateNameType, int nameType) { 148 int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; 149 return sMinScore[index]; 150 } 151 152 /** 153 * Returns the upper range for the match score for the given {@code candidateNameType} and 154 * {@code nameType}. 155 */ getMaxScore(int candidateNameType, int nameType)156 private static int getMaxScore(int candidateNameType, int nameType) { 157 int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; 158 return sMaxScore[index]; 159 } 160 161 private final ArrayMap<Long, MatchScore> mScores = new ArrayMap<>(); 162 private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>(); 163 private int mScoreCount = 0; 164 165 private final NameDistance mNameDistanceConservative = new NameDistance(); 166 private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH); 167 getMatchingScore(long rawContactId, long contactId, long accountId)168 private MatchScore getMatchingScore(long rawContactId, long contactId, long accountId) { 169 MatchScore matchingScore = mScores.get(rawContactId); 170 if (matchingScore == null) { 171 if (mScoreList.size() > mScoreCount) { 172 matchingScore = mScoreList.get(mScoreCount); 173 matchingScore.reset(rawContactId, contactId, accountId); 174 } else { 175 matchingScore = new MatchScore(rawContactId, contactId, accountId); 176 mScoreList.add(matchingScore); 177 } 178 mScoreCount++; 179 mScores.put(rawContactId, matchingScore); 180 } 181 return matchingScore; 182 } 183 184 /** 185 * Checks if there is a match and updates the overall score for the 186 * specified contact for a discovered match. The new score is determined 187 * by the prior score, by the type of name we were looking for, the type 188 * of name we found and, if the match is approximate, the distance between the candidate and 189 * actual name. 190 */ matchName(long rawContactId, long contactId, long accountId, int candidateNameType, String candidateName, int nameType, String name, int algorithm)191 public void matchName(long rawContactId, long contactId, long accountId, int 192 candidateNameType, String candidateName, int nameType, String name, int algorithm) { 193 int maxScore = getMaxScore(candidateNameType, nameType); 194 if (maxScore == 0) { 195 return; 196 } 197 198 if (candidateName.equals(name)) { 199 updatePrimaryScore(rawContactId, contactId, accountId, maxScore); 200 return; 201 } 202 203 if (algorithm == MATCHING_ALGORITHM_EXACT) { 204 return; 205 } 206 207 int minScore = getMinScore(candidateNameType, nameType); 208 if (minScore == maxScore) { 209 return; 210 } 211 212 final byte[] decodedCandidateName; 213 final byte[] decodedName; 214 try { 215 decodedCandidateName = Hex.decodeHex(candidateName); 216 decodedName = Hex.decodeHex(name); 217 } catch (RuntimeException e) { 218 // How could this happen?? See bug 6827136 219 Log.e(TAG, "Failed to decode normalized name. Skipping.", e); 220 return; 221 } 222 223 NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ? 224 mNameDistanceConservative : mNameDistanceApproximate; 225 226 int score; 227 float distance = nameDistance.getDistance(decodedCandidateName, decodedName); 228 boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME 229 || nameType == NameLookupType.EMAIL_BASED_NICKNAME; 230 float threshold = emailBased 231 ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL 232 : APPROXIMATE_MATCH_THRESHOLD; 233 if (distance > threshold) { 234 score = (int)(minScore + (maxScore - minScore) * (1.0f - distance)); 235 } else { 236 score = 0; 237 } 238 239 updatePrimaryScore(rawContactId, contactId, accountId, score); 240 } 241 matchIdentity(long rawContactId, long contactId, long accountId)242 public void matchIdentity(long rawContactId, long contactId, long accountId) { 243 updateSecondaryScore(rawContactId, contactId, accountId, IDENTITY_MATCH_SCORE); 244 } 245 updateScoreWithPhoneNumberMatch(long rawContactId, long contactId, long accountId)246 public void updateScoreWithPhoneNumberMatch(long rawContactId, long contactId, long accountId) { 247 updateSecondaryScore(rawContactId, contactId, accountId, PHONE_MATCH_SCORE); 248 } 249 updateScoreWithEmailMatch(long rawContactId, long contactId, long accountId)250 public void updateScoreWithEmailMatch(long rawContactId, long contactId, long accountId) { 251 updateSecondaryScore(rawContactId, contactId, accountId, EMAIL_MATCH_SCORE); 252 } 253 updateScoreWithNicknameMatch(long rawContactId, long contactId, long accountId)254 public void updateScoreWithNicknameMatch(long rawContactId, long contactId, long accountId) { 255 updateSecondaryScore(rawContactId, contactId, accountId, NICKNAME_MATCH_SCORE); 256 } 257 updatePrimaryScore(long rawContactId, long contactId, long accountId, int score)258 private void updatePrimaryScore(long rawContactId, long contactId, long accountId, int score) { 259 getMatchingScore(rawContactId, contactId, accountId).updatePrimaryScore(score); 260 } 261 updateSecondaryScore(long rawContactId, long contactId, long accountId, int score)262 private void updateSecondaryScore(long rawContactId, long contactId, long accountId, 263 int score) { 264 getMatchingScore(rawContactId, contactId, accountId).updateSecondaryScore(score); 265 } 266 keepIn(long rawContactId, long contactId, long accountId)267 public void keepIn(long rawContactId, long contactId, long accountId) { 268 getMatchingScore(rawContactId, contactId, accountId).keepIn(); 269 } 270 keepOut(long rawContactId, long contactId, long accountId)271 public void keepOut(long rawContactId, long contactId, long accountId) { 272 getMatchingScore(rawContactId, contactId, accountId).keepOut(); 273 } 274 clear()275 public void clear() { 276 mScores.clear(); 277 mScoreCount = 0; 278 } 279 /** 280 * Returns a list of IDs for raw contacts that are only matched on secondary data elements 281 * (phone number, email address, nickname, identity). We need to check if they are missing 282 * structured name or not to decide if they should be aggregated. 283 * <p> 284 * May return null. 285 */ prepareSecondaryMatchCandidates()286 public List<Long> prepareSecondaryMatchCandidates() { 287 ArrayList<Long> rawContactIds = null; 288 289 for (int i = 0; i < mScoreCount; i++) { 290 MatchScore score = mScoreList.get(i); 291 if (score.isKeepOut() || score.getPrimaryScore() > SCORE_THRESHOLD_PRIMARY){ 292 continue; 293 } 294 295 if (score.getSecondaryScore() >= SCORE_THRESHOLD_PRIMARY) { 296 if (rawContactIds == null) { 297 rawContactIds = new ArrayList<>(); 298 } 299 rawContactIds.add(score.getRawContactId()); 300 } 301 score.setPrimaryScore(0); 302 } 303 return rawContactIds; 304 } 305 306 /** 307 * Returns the list of raw contact Ids with the match score over threshold. 308 */ pickBestMatches()309 public List<MatchScore> pickBestMatches() { 310 final List<MatchScore> matches = new ArrayList<>(); 311 for (int i = 0; i < mScoreCount; i++) { 312 MatchScore score = mScoreList.get(i); 313 if (score.isKeepOut()) { 314 continue; 315 } 316 317 if (score.isKeepIn()) { 318 matches.add(score); 319 continue; 320 } 321 322 if (score.getPrimaryScore() >= SCORE_THRESHOLD_PRIMARY || 323 (score.getPrimaryScore() == SCORE_THRESHOLD_NO_NAME && 324 score.getSecondaryScore() > SCORE_THRESHOLD_SECONDARY)) { 325 matches.add(score); 326 } 327 } 328 return matches; 329 } 330 331 /** 332 * Returns matches in the order of descending score. 333 */ pickBestMatches(int threshold)334 public List<MatchScore> pickBestMatches(int threshold) { 335 int scaledThreshold = threshold * SCORE_SCALE; 336 List<MatchScore> matches = mScoreList.subList(0, mScoreCount); 337 Collections.sort(matches); 338 int count = 0; 339 for (int i = 0; i < mScoreCount; i++) { 340 MatchScore matchScore = matches.get(i); 341 if (matchScore.getScore() >= scaledThreshold) { 342 count++; 343 } else { 344 break; 345 } 346 } 347 348 return matches.subList(0, count); 349 } 350 351 @Override toString()352 public String toString() { 353 return mScoreList.subList(0, mScoreCount).toString(); 354 } 355 matchNoName(Long rawContactId, Long contactId, Long accountId)356 public void matchNoName(Long rawContactId, Long contactId, Long accountId) { 357 updatePrimaryScore(rawContactId, contactId, accountId, SCORE_THRESHOLD_NO_NAME); 358 } 359 } 360