#!/usr/bin/env python
"""Sanity checks for the fonts and fonts.xml shipped with an Android build.

Validates hyphenation coverage, font-family structure, emoji font tables,
and related invariants for the system font collection.
"""

import collections
import copy
import glob
from os import path
import re
import sys
from xml.etree import ElementTree

from fontTools import ttLib

# U+FE0F VARIATION SELECTOR-16: requests emoji presentation.
EMOJI_VS = 0xFE0F

# Maps a language subtag to the ISO 15924 script it is assumed to be
# written in, for hyphenation files that name only a language.
LANG_TO_SCRIPT = {
    'as': 'Beng',
    'be': 'Cyrl',
    'bg': 'Cyrl',
    'bn': 'Beng',
    'cu': 'Cyrl',
    'cy': 'Latn',
    'da': 'Latn',
    'de': 'Latn',
    'en': 'Latn',
    'es': 'Latn',
    'et': 'Latn',
    'eu': 'Latn',
    'fr': 'Latn',
    'ga': 'Latn',
    'gu': 'Gujr',
    'hi': 'Deva',
    'hr': 'Latn',
    'hu': 'Latn',
    'hy': 'Armn',
    'ja': 'Jpan',
    'kn': 'Knda',
    'ko': 'Kore',
    'la': 'Latn',
    'ml': 'Mlym',
    'mn': 'Cyrl',
    'mr': 'Deva',
    'nb': 'Latn',
    'nn': 'Latn',
    'or': 'Orya',
    'pa': 'Guru',
    'pt': 'Latn',
    'sl': 'Latn',
    'ta': 'Taml',
    'te': 'Telu',
    'tk': 'Latn',
}

def lang_to_script(lang_code):
    """Return the ISO 15924 script code for a BCP-47 language code.

    Strips subtags from the right until the remaining prefix is found in
    LANG_TO_SCRIPT, or until a four-letter alphabetic subtag (an explicit
    script subtag) is reached, which is returned directly.

    Raises AssertionError if the script cannot be determined.
    """
    lang = lang_code.lower()
    while lang not in LANG_TO_SCRIPT:
        hyphen_idx = lang.rfind('-')
        assert hyphen_idx != -1, (
            'We do not know what script the "%s" language is written in.'
            % lang_code)
        assumed_script = lang[hyphen_idx+1:]
        if len(assumed_script) == 4 and assumed_script.isalpha():
            # This is actually the script
            return assumed_script.title()
        lang = lang[:hyphen_idx]
    return LANG_TO_SCRIPT[lang]


def printable(inp):
    """Format a character, character sequence (tuple), or set of sequences
    as a human-readable U+XXXX string for error messages."""
    if type(inp) is set:  # set of character sequences
        return '{' + ', '.join([printable(seq) for seq in inp]) + '}'
    if type(inp) is tuple:  # character sequence
        return '<' + (', '.join([printable(ch) for ch in inp])) + '>'
    else:  # single character
        return 'U+%04X' % inp


def open_font(font):
    """Open a (file_name, collection_index) pair as a fontTools TTFont.

    Paths are resolved relative to the module-global _fonts_dir set in main().
    """
    font_file, index = font
    font_path = path.join(_fonts_dir, font_file)
    if index is not None:
        return ttLib.TTFont(font_path, fontNumber=index)
    else:
        return ttLib.TTFont(font_path)


def get_best_cmap(font):
    """Return the best available Unicode cmap dict for the font.

    Prefers the UCS-4 (format 12, Windows platform) subtable; falls back to
    the BMP (format 4) subtable. Asserts that each appears at most once.
    """
    ttfont = open_font(font)
    all_unicode_cmap = None
    bmp_cmap = None
    for cmap in ttfont['cmap'].tables:
        specifier = (cmap.format, cmap.platformID, cmap.platEncID)
        if specifier == (4, 3, 1):
            assert bmp_cmap is None, 'More than one BMP cmap in %s' % (font, )
            bmp_cmap = cmap
        elif specifier == (12, 3, 10):
            assert all_unicode_cmap is None, (
                'More than one UCS-4 cmap in %s' % (font, ))
            all_unicode_cmap = cmap

    return all_unicode_cmap.cmap if all_unicode_cmap else bmp_cmap.cmap


def get_variation_sequences_cmap(font):
    """Return the variation-sequence cmap subtable (format 14), or None."""
    ttfont = open_font(font)
    vs_cmap = None
    for cmap in ttfont['cmap'].tables:
        specifier = (cmap.format, cmap.platformID, cmap.platEncID)
        if specifier == (14, 0, 5):
            assert vs_cmap is None, 'More than one VS cmap in %s' % (font, )
            vs_cmap = cmap
    return vs_cmap


def get_emoji_map(font):
    """Build a map from code point / sequence to glyph name for an emoji font.

    Combines the plain cmap, variation sequences (format-14 subtable), and
    ligature substitutions (GSUB LookupType 4), so sequences handled via
    ligation are reported as supported.
    """
    # Add normal characters
    emoji_map = copy.copy(get_best_cmap(font))
    # PUA entries are excluded so ligature components resolve to real
    # code points below.
    reverse_cmap = {glyph: code for code, glyph in emoji_map.items()
                    if not contains_pua(code)}

    # Add variation sequences
    vs_cmap = get_variation_sequences_cmap(font)
    if vs_cmap:
        for vs in vs_cmap.uvsDict:
            for base, glyph in vs_cmap.uvsDict[vs]:
                if glyph is None:
                    # Default UVS entry: same glyph as the bare base character.
                    emoji_map[(base, vs)] = emoji_map[base]
                else:
                    emoji_map[(base, vs)] = glyph

    # Add GSUB rules
    ttfont = open_font(font)
    for lookup in ttfont['GSUB'].table.LookupList.Lookup:
        if lookup.LookupType != 4:
            # Other lookups are used in the emoji font for fallback.
            # We ignore them for now.
            continue
        for subtable in lookup.SubTable:
            ligatures = subtable.ligatures
            for first_glyph in ligatures:
                for ligature in ligatures[first_glyph]:
                    sequence = [first_glyph] + ligature.Component
                    sequence = [reverse_cmap[glyph] for glyph in sequence]
                    sequence = tuple(sequence)
                    # Make sure no starting subsequence of 'sequence' has been
                    # seen before.
                    for sub_len in range(2, len(sequence)+1):
                        subsequence = sequence[:sub_len]
                        assert subsequence not in emoji_map
                    emoji_map[sequence] = ligature.LigGlyph

    return emoji_map


def assert_font_supports_any_of_chars(font, chars):
    """Exit with an error unless the font supports at least one of chars."""
    best_cmap = get_best_cmap(font)
    for char in chars:
        if char in best_cmap:
            return
    sys.exit('None of characters in %s were found in %s' % (chars, font))


def assert_font_supports_all_of_chars(font, chars):
    """Assert that every character in chars is present in the font's cmap."""
    best_cmap = get_best_cmap(font)
    for char in chars:
        assert char in best_cmap, (
            'U+%04X was not found in %s' % (char, font))


def assert_font_supports_none_of_chars(font, chars, fallbackName):
    """Assert that no character in chars is present in the font's cmap.

    fallbackName, when non-empty, names the fallback chain being checked and
    is included in the failure message.
    """
    best_cmap = get_best_cmap(font)
    for char in chars:
        # BUG FIX: the two messages were swapped — the three-argument
        # "in fallback %s" message was used when fallbackName was empty.
        if fallbackName:
            assert char not in best_cmap, (
                'U+%04X was found in %s in fallback %s' % (char, font, fallbackName))
        else:
            assert char not in best_cmap, 'U+%04X was found in %s' % (char, font)


def assert_font_supports_all_sequences(font, sequences):
    """Assert that every (base, vs) variation sequence is in the VS cmap."""
    vs_dict = get_variation_sequences_cmap(font).uvsDict
    for base, vs in sorted(sequences):
        assert vs in vs_dict and (base, None) in vs_dict[vs], (
            '<U+%04X, U+%04X> was not found in %s' % (base, vs, font))


def check_hyphens(hyphens_dir):
    """Check that every script with hyphenation data has a font covering
    at least one hyphen character (U+002D or U+2010)."""
    # Find all the scripts that need automatic hyphenation
    scripts = set()
    for hyb_file in glob.iglob(path.join(hyphens_dir, '*.hyb')):
        hyb_file = path.basename(hyb_file)
        assert hyb_file.startswith('hyph-'), (
            'Unknown hyphenation file %s' % hyb_file)
        lang_code = hyb_file[hyb_file.index('-')+1:hyb_file.index('.')]
        scripts.add(lang_to_script(lang_code))

    HYPHENS = {0x002D, 0x2010}
    for script in scripts:
        fonts = _script_to_font_map[script]
        assert fonts, 'No fonts found for the "%s" script' % script
        for font in fonts:
            assert_font_supports_any_of_chars(font, HYPHENS)


class FontRecord(object):
    """A single <font> entry parsed from fonts.xml."""

    def __init__(self, name, psName, scripts, variant, weight, style,
                 fallback_for, font):
        self.name = name                  # family name, or None for fallbacks
        self.psName = psName              # postScriptName attribute, or None
        self.scripts = scripts            # frozenset of ISO 15924 scripts
        self.variant = variant            # None, 'elegant', or 'compact'
        self.weight = weight              # multiple of 100
        self.style = style                # 'normal' or 'italic'
        self.fallback_for = fallback_for  # fallbackFor attribute, or None
        self.font = font                  # (file name, collection index)


def parse_fonts_xml(fonts_xml_path):
    """Parse fonts.xml, populating the module globals _script_to_font_map,
    _fallback_chains, and _all_fonts.

    Missing font files are silently skipped (a valid build configuration).
    """
    global _script_to_font_map, _fallback_chains, _all_fonts
    _script_to_font_map = collections.defaultdict(set)
    _fallback_chains = {}
    _all_fonts = []
    tree = ElementTree.parse(fonts_xml_path)
    families = tree.findall('family')
    # Minikin supports up to 254 but users can place their own font at the first
    # place. Thus, 253 is the maximum allowed number of font families in the
    # default collection.
    assert len(families) < 254, (
        'System font collection can contains up to 253 font families.')

    # First pass: validate family attributes and create fallback chains for
    # each named family.
    for family in families:
        name = family.get('name')
        variant = family.get('variant')
        langs = family.get('lang')
        if name:
            assert variant is None, (
                'No variant expected for LGC font %s.' % name)
            assert langs is None, (
                'No language expected for LGC fonts %s.' % name)
            assert name not in _fallback_chains, 'Duplicated name entry %s' % name
            _fallback_chains[name] = []
        else:
            assert variant in {None, 'elegant', 'compact'}, (
                'Unexpected value for variant: %s' % variant)

    trim_re = re.compile(r"^[ \n\r\t]*(.+)[ \n\r\t]*$")
    # Second pass: parse the individual <font> children.
    for family in families:
        name = family.get('name')
        variant = family.get('variant')
        langs = family.get('lang')

        if langs:
            langs = langs.split()
            scripts = {lang_to_script(lang) for lang in langs}
        else:
            scripts = set()

        for child in family:
            assert child.tag == 'font', (
                'Unknown tag <%s>' % child.tag)
            font_file = child.text.rstrip()

            m = trim_re.match(font_file)
            font_file = m.group(1)

            weight = int(child.get('weight'))
            assert weight % 100 == 0, (
                'Font weight "%d" is not a multiple of 100.' % weight)

            style = child.get('style')
            assert style in {'normal', 'italic'}, (
                'Unknown style "%s"' % style)

            fallback_for = child.get('fallbackFor')

            assert not name or not fallback_for, (
                'name and fallbackFor cannot be present at the same time')
            assert not fallback_for or fallback_for in _fallback_chains, (
                'Unknown fallback name: %s' % fallback_for)

            index = child.get('index')
            if index:
                index = int(index)

            if not path.exists(path.join(_fonts_dir, m.group(1))):
                continue  # Missing font is a valid case. Just ignore the missing font files.

            record = FontRecord(
                name,
                child.get('postScriptName'),
                frozenset(scripts),
                variant,
                weight,
                style,
                fallback_for,
                (font_file, index))

            _all_fonts.append(record)

            if not fallback_for:
                if not name or name == 'sans-serif':
                    # Unnamed fonts and sans-serif participate in every chain.
                    for _, fallback in _fallback_chains.items():
                        fallback.append(record)
                else:
                    _fallback_chains[name].append(record)
            else:
                _fallback_chains[fallback_for].append(record)

            if name:  # non-empty names are used for default LGC fonts
                map_scripts = {'Latn', 'Grek', 'Cyrl'}
            else:
                map_scripts = scripts
            for script in map_scripts:
                _script_to_font_map[script].add((font_file, index))


def check_emoji_coverage(all_emoji, equivalent_emoji):
    """Verify emoji coverage across all emoji fonts in the collection."""
    emoji_fonts = get_emoji_fonts()
    check_emoji_font_coverage(emoji_fonts, all_emoji, equivalent_emoji)


def get_emoji_fonts():
    """Return the (file, index) pairs of all fonts declared for Zsye."""
    return [record.font for record in _all_fonts if 'Zsye' in record.scripts]


def is_pua(x):
    """Return True if code point x lies in a Private Use Area."""
    return 0xE000 <= x <= 0xF8FF or 0xF0000 <= x <= 0xFFFFD or 0x100000 <= x <= 0x10FFFD


def contains_pua(sequence):
    """Return True if the code point, or any code point of the tuple, is PUA."""
    if type(sequence) is tuple:
        return any([is_pua(x) for x in sequence])
    else:
        return is_pua(sequence)


def get_psname(ttf):
    """Return the PostScript name (nameID 6, Windows platform) of a TTFont."""
    return str(next(x for x in ttf['name'].names
                    if x.platformID == 3 and x.platEncID == 1 and x.nameID == 6))


def check_emoji_compat():
    """Verify that NotoColorEmoji carries the EmojiCompat 'Emji' meta data."""
    for emoji_font in get_emoji_fonts():
        ttf = open_font(emoji_font)
        psname = get_psname(ttf)

        # If the font file is NotoColorEmoji, it must be Compat font.
        if psname == 'NotoColorEmoji':
            meta = ttf['meta']
            assert meta, 'Compat font must have meta table'
            assert 'Emji' in meta.data, 'meta table should have \'Emji\' data.'
def check_emoji_font_coverage(emoji_fonts, all_emoji, equivalent_emoji):
    """Check that the emoji fonts cover exactly the expected emoji set.

    Collects all errors and reports them in a single assertion:
    - every sequence in all_emoji is supported by at least one emoji font;
    - no font supports an unexpected sequence (PUA and a few control
      characters are tolerated);
    - sequences declared equivalent map to the same glyph, and sequences
      sharing a glyph are declared equivalent.
    """
    coverages = []
    for emoji_font in emoji_fonts:
        coverages.append(get_emoji_map(emoji_font))

    errors = []

    for sequence in all_emoji:
        if all([sequence not in coverage for coverage in coverages]):
            errors.append('%s is not supported in the emoji font.' % printable(sequence))

    for coverage in coverages:
        for sequence in coverage:
            if sequence in {0x0000, 0x000D, 0x0020}:
                # The font needs to support a few extra characters, which is OK
                continue

            if contains_pua(sequence):
                # The font needs to have some PUA for EmojiCompat library.
                continue

            if sequence not in all_emoji:
                errors.append('%s support unexpected in the emoji font.' % printable(sequence))

    for first, second in equivalent_emoji.items():
        for coverage in coverages:
            if first not in coverage or second not in coverage:
                continue  # sequence will be reported missing
            if coverage[first] != coverage[second]:
                errors.append('%s and %s should map to the same glyph.' % (
                    printable(first),
                    printable(second)))

    for coverage in coverages:
        for glyph in set(coverage.values()):
            maps_to_glyph = [
                seq for seq in coverage if coverage[seq] == glyph and not contains_pua(seq)]
            if len(maps_to_glyph) > 1:
                # There are more than one sequences mapping to the same glyph. We
                # need to make sure they were expected to be equivalent.
                equivalent_seqs = set()
                for seq in maps_to_glyph:
                    equivalent_seq = seq
                    # Follow the equivalence chain to its canonical sequence.
                    while equivalent_seq in equivalent_emoji:
                        equivalent_seq = equivalent_emoji[equivalent_seq]
                    equivalent_seqs.add(equivalent_seq)
                if len(equivalent_seqs) != 1:
                    errors.append('The sequences %s should not result in the same glyph %s' % (
                        printable(equivalent_seqs),
                        glyph))

    assert not errors, '%d emoji font errors:\n%s\n%d emoji font coverage errors' % (len(errors), '\n'.join(errors), len(errors))


def check_emoji_defaults(default_emoji):
    """Check that text fonts above/around the emoji font in each fallback
    chain do not shadow default-emoji-style characters, and that all
    default-text-style emoji have a monochrome glyph somewhere above the
    emoji font (Unicode 7.0 additions exempted)."""
    missing_text_chars = _emoji_properties['Emoji'] - default_emoji
    for name, fallback_chain in _fallback_chains.items():
        emoji_font_seen = False
        for record in fallback_chain:
            if 'Zsye' in record.scripts:
                emoji_font_seen = True
                # No need to check the emoji font
                continue
            # For later fonts, we only check them if they have a script
            # defined, since the defined script may get them to a higher
            # score even if they appear after the emoji font. However,
            # we should skip checking the text symbols font, since
            # symbol fonts should be able to override the emoji display
            # style when 'Zsym' is explicitly specified by the user.
            if emoji_font_seen and (not record.scripts or 'Zsym' in record.scripts):
                continue

            # Check default emoji-style characters
            assert_font_supports_none_of_chars(record.font, default_emoji, name)

            # Mark default text-style characters appearing in fonts above the emoji
            # font as seen
            if not emoji_font_seen:
                missing_text_chars -= set(get_best_cmap(record.font))

    # Noto does not have monochrome glyphs for Unicode 7.0 wingdings and
    # webdings yet.
    missing_text_chars -= _chars_by_age['7.0']
    assert missing_text_chars == set(), (
        'Text style version of some emoji characters are missing: ' +
        repr(missing_text_chars))


# Setting reverse to true returns a dictionary that maps the values to sets of
# characters, useful for some binary properties. Otherwise, we get a
# dictionary that maps characters to the property values, assuming there's only
# one property in the file.
def parse_unicode_datafile(file_path, reverse=False):
    """Parse a UCD-style data file of 'chars ; property' lines.

    'chars' may be a single code point, a range 'XXXX..YYYY', or a
    space-separated sequence (stored as a tuple). Comments after '#'
    are stripped.
    """
    if reverse:
        output_dict = collections.defaultdict(set)
    else:
        output_dict = {}
    with open(file_path) as datafile:
        for line in datafile:
            if '#' in line:
                line = line[:line.index('#')]
            line = line.strip()
            if not line:
                continue

            chars, prop = line.split(';')[:2]
            chars = chars.strip()
            prop = prop.strip()

            if ' ' in chars:  # character sequence
                sequence = [int(ch, 16) for ch in chars.split(' ')]
                additions = [tuple(sequence)]
            elif '..' in chars:  # character range
                char_start, char_end = chars.split('..')
                char_start = int(char_start, 16)
                char_end = int(char_end, 16)
                additions = range(char_start, char_end+1)
            else:  # single character
                additions = [int(chars, 16)]
            if reverse:
                output_dict[prop].update(additions)
            else:
                for addition in additions:
                    # Each char/sequence must carry at most one property.
                    assert addition not in output_dict
                    output_dict[addition] = prop
    return output_dict


def parse_emoji_variants(file_path):
    """Parse emoji-variation-sequences.txt.

    Returns (text_set, emoji_set): sets of (base, variation_selector) pairs
    for 'text style' and 'emoji style' entries respectively.
    """
    emoji_set = set()
    text_set = set()
    with open(file_path) as datafile:
        for line in datafile:
            if '#' in line:
                line = line[:line.index('#')]
            line = line.strip()
            if not line:
                continue
            sequence, description, _ = line.split(';')
            sequence = sequence.strip().split(' ')
            base = int(sequence[0], 16)
            vs = int(sequence[1], 16)
            description = description.strip()
            if description == 'text style':
                text_set.add((base, vs))
            elif description == 'emoji style':
                emoji_set.add((base, vs))
    return text_set, emoji_set


def parse_ucd(ucd_path):
    """Load the UCD/emoji data files, populating the emoji-related module
    globals, merging the 'additions' overlay files and applying the
    exclusion lists."""
    global _emoji_properties, _chars_by_age
    global _text_variation_sequences, _emoji_variation_sequences
    global _emoji_sequences, _emoji_zwj_sequences
    _emoji_properties = parse_unicode_datafile(
        path.join(ucd_path, 'emoji-data.txt'), reverse=True)
    emoji_properties_additions = parse_unicode_datafile(
        path.join(ucd_path, 'additions', 'emoji-data.txt'), reverse=True)
    for prop in emoji_properties_additions.keys():
        _emoji_properties[prop].update(emoji_properties_additions[prop])

    _chars_by_age = parse_unicode_datafile(
        path.join(ucd_path, 'DerivedAge.txt'), reverse=True)
    sequences = parse_emoji_variants(
        path.join(ucd_path, 'emoji-variation-sequences.txt'))
    _text_variation_sequences, _emoji_variation_sequences = sequences
    _emoji_sequences = parse_unicode_datafile(
        path.join(ucd_path, 'emoji-sequences.txt'))
    _emoji_sequences.update(parse_unicode_datafile(
        path.join(ucd_path, 'additions', 'emoji-sequences.txt')))
    _emoji_zwj_sequences = parse_unicode_datafile(
        path.join(ucd_path, 'emoji-zwj-sequences.txt'))
    _emoji_zwj_sequences.update(parse_unicode_datafile(
        path.join(ucd_path, 'additions', 'emoji-zwj-sequences.txt')))

    exclusions = parse_unicode_datafile(path.join(ucd_path, 'additions', 'emoji-exclusions.txt'))
    _emoji_sequences = remove_emoji_exclude(_emoji_sequences, exclusions)
    _emoji_zwj_sequences = remove_emoji_exclude(_emoji_zwj_sequences, exclusions)
    _emoji_variation_sequences = remove_emoji_variation_exclude(_emoji_variation_sequences, exclusions)
    # Unicode 12.0 adds Basic_Emoji in emoji-sequences.txt. We ignore them here since we are already
    # checking the emoji presentations with emoji-variation-sequences.txt.
    # Please refer to http://unicode.org/reports/tr51/#def_basic_emoji_set .
    _emoji_sequences = {k: v for k, v in _emoji_sequences.items() if not v == 'Basic_Emoji'}


def remove_emoji_variation_exclude(source, items):
    """Return source (a set) minus the excluded keys."""
    return source.difference(items.keys())


def remove_emoji_exclude(source, items):
    """Return source (a dict) without entries whose keys are excluded."""
    return {k: v for k, v in source.items() if k not in items}


def flag_sequence(territory_code):
    """Return the regional-indicator tuple for a two-letter territory code."""
    return tuple(0x1F1E6 + ord(ch) - ord('A') for ch in territory_code)


# Flags that are rendered with the glyph of another territory's flag.
EQUIVALENT_FLAGS = {
    flag_sequence('BV'): flag_sequence('NO'),
    flag_sequence('CP'): flag_sequence('FR'),
    flag_sequence('HM'): flag_sequence('AU'),
    flag_sequence('SJ'): flag_sequence('NO'),
    flag_sequence('UM'): flag_sequence('US'),
}

# U+20E3 COMBINING ENCLOSING KEYCAP.
COMBINING_KEYCAP = 0x20E3

# Pre-Unicode Android PUA emoji and the standard sequences they map to.
LEGACY_ANDROID_EMOJI = {
    0xFE4E5: flag_sequence('JP'),
    0xFE4E6: flag_sequence('US'),
    0xFE4E7: flag_sequence('FR'),
    0xFE4E8: flag_sequence('DE'),
    0xFE4E9: flag_sequence('IT'),
    0xFE4EA: flag_sequence('GB'),
    0xFE4EB: flag_sequence('ES'),
    0xFE4EC: flag_sequence('RU'),
    0xFE4ED: flag_sequence('CN'),
    0xFE4EE: flag_sequence('KR'),
    0xFE82C: (ord('#'), COMBINING_KEYCAP),
    0xFE82E: (ord('1'), COMBINING_KEYCAP),
    0xFE82F: (ord('2'), COMBINING_KEYCAP),
    0xFE830: (ord('3'), COMBINING_KEYCAP),
    0xFE831: (ord('4'), COMBINING_KEYCAP),
    0xFE832: (ord('5'), COMBINING_KEYCAP),
    0xFE833: (ord('6'), COMBINING_KEYCAP),
    0xFE834: (ord('7'), COMBINING_KEYCAP),
    0xFE835: (ord('8'), COMBINING_KEYCAP),
    0xFE836: (ord('9'), COMBINING_KEYCAP),
    0xFE837: (ord('0'), COMBINING_KEYCAP),
}

# This is used to define the emoji that should have the same glyph.
# i.e. previously we had gender based Kiss (0x1F48F), which had the same glyph
# with Kiss: Woman, Man (0x1F469, 0x200D, 0x2764, 0x200D, 0x1F48B, 0x200D, 0x1F468)
# in that case a valid row would be:
# (0x1F469, 0x200D, 0x2764, 0x200D, 0x1F48B, 0x200D, 0x1F468): 0x1F48F,
ZWJ_IDENTICALS = {
}

# Flag pairs that intentionally share a glyph.
SAME_FLAG_MAPPINGS = [
    # Diego Garcia and British Indian Ocean Territory
    ((0x1F1EE, 0x1F1F4), (0x1F1E9, 0x1F1EC)),
    # St. Martin and France
    ((0x1F1F2, 0x1F1EB), (0x1F1EB, 0x1F1F7)),
    # Spain and Ceuta & Melilla
    ((0x1F1EA, 0x1F1F8), (0x1F1EA, 0x1F1E6)),
]

# U+200D ZERO WIDTH JOINER.
ZWJ = 0x200D


def is_fitzpatrick_modifier(cp):
    """Return True for the Fitzpatrick skin-tone modifiers U+1F3FB..U+1F3FF."""
    return 0x1F3FB <= cp <= 0x1F3FF


def reverse_emoji(seq):
    """Reverse a sequence, keeping Fitzpatrick modifiers after the emoji
    they modify."""
    rev = list(reversed(seq))
    # if there are fitzpatrick modifiers in the sequence, keep them after
    # the emoji they modify
    for i in range(1, len(rev)):
        if is_fitzpatrick_modifier(rev[i-1]):
            rev[i], rev[i-1] = rev[i-1], rev[i]
    return tuple(rev)


def compute_expected_emoji():
    """Derive the expected emoji coverage from the parsed UCD data.

    Returns (all_emoji, default_emoji, equivalent_emoji):
    - all_emoji: every char/sequence the emoji fonts must support;
    - default_emoji: chars/sequences that default to emoji presentation;
    - equivalent_emoji: map from a sequence to the sequence whose glyph it
      must share.

    NOTE: mutates the module-global _emoji_sequences (adds the empty flag
    tag sequence), so it must run after parse_ucd().
    """
    equivalent_emoji = {}
    sequence_pieces = set()
    all_sequences = set()
    all_sequences.update(_emoji_variation_sequences)

    # add zwj sequences not in the current emoji-zwj-sequences.txt
    adjusted_emoji_zwj_sequences = dict(_emoji_zwj_sequences)
    # NOTE(review): this update is a no-op — it re-applies the same dict
    # that was just copied. Presumably additional sequences were meant to
    # be merged here; confirm against the comment above.
    adjusted_emoji_zwj_sequences.update(_emoji_zwj_sequences)

    # Add empty flag tag sequence that is supported as fallback
    _emoji_sequences[(0x1F3F4, 0xE007F)] = 'Emoji_Tag_Sequence'

    for sequence in _emoji_sequences.keys():
        sequence = tuple(ch for ch in sequence if ch != EMOJI_VS)
        all_sequences.add(sequence)
        sequence_pieces.update(sequence)
        if _emoji_sequences.get(sequence, None) == 'Emoji_Tag_Sequence':
            # Add reverse of the emoji tag sequences, which are added to the
            # fonts as a workaround to get the sequences work in RTL text.
            # TODO: test if these are actually needed by Minikin/HarfBuzz.
            reversed_seq = reverse_emoji(sequence)
            all_sequences.add(reversed_seq)
            equivalent_emoji[reversed_seq] = sequence

    for sequence in adjusted_emoji_zwj_sequences.keys():
        sequence = tuple(ch for ch in sequence if ch != EMOJI_VS)
        all_sequences.add(sequence)
        sequence_pieces.update(sequence)
        # Add reverse of all emoji ZWJ sequences, which are added to the fonts
        # as a workaround to get the sequences work in RTL text.
        reversed_seq = reverse_emoji(sequence)
        all_sequences.add(reversed_seq)
        equivalent_emoji[reversed_seq] = sequence

    for first, second in SAME_FLAG_MAPPINGS:
        equivalent_emoji[first] = second

    # Add all tag characters used in flags
    sequence_pieces.update(range(0xE0030, 0xE0039 + 1))
    sequence_pieces.update(range(0xE0061, 0xE007A + 1))

    all_emoji = (
        _emoji_properties['Emoji'] |
        all_sequences |
        sequence_pieces |
        set(LEGACY_ANDROID_EMOJI.keys()))
    default_emoji = (
        _emoji_properties['Emoji_Presentation'] |
        all_sequences |
        set(LEGACY_ANDROID_EMOJI.keys()))

    equivalent_emoji.update(EQUIVALENT_FLAGS)
    equivalent_emoji.update(LEGACY_ANDROID_EMOJI)
    equivalent_emoji.update(ZWJ_IDENTICALS)

    # Every variation sequence must render like its bare base character.
    for seq in _emoji_variation_sequences:
        equivalent_emoji[seq] = seq[0]

    return all_emoji, default_emoji, equivalent_emoji


def check_compact_only_fallback():
    """Check that every 'compact' font in a fallback chain has an 'elegant'
    counterpart covering the same scripts."""
    for name, fallback_chain in _fallback_chains.items():
        for record in fallback_chain:
            if record.variant == 'compact':
                same_script_elegants = [x for x in fallback_chain
                    if x.scripts == record.scripts and x.variant == 'elegant']
                assert same_script_elegants, (
                    '%s must be in elegant of %s as fallback of "%s" too' % (
                        record.font, record.scripts, record.fallback_for),)


def check_vertical_metrics():
    """Check hard-coded vertical metrics of the core LGC families.

    sans-serif families must have head.yMax == 2163 / head.yMin == -555;
    sans-serif, serif, and monospace must have hhea ascent 1900 /
    descent -500.
    """
    for record in _all_fonts:
        if record.name in ['sans-serif', 'sans-serif-condensed']:
            font = open_font(record.font)
            assert font['head'].yMax == 2163 and font['head'].yMin == -555, (
                'yMax and yMin of %s do not match expected values.' % (
                    record.font,))

        if record.name in ['sans-serif', 'sans-serif-condensed',
                           'serif', 'monospace']:
            font = open_font(record.font)
            assert (font['hhea'].ascent == 1900 and
                    font['hhea'].descent == -500), (
                        'ascent and descent of %s do not match expected '
                        'values.' % (record.font,))


def check_cjk_punctuation():
    """Check that no font ranked above the first CJK font in any fallback
    chain supports CJK punctuation (U+3000..U+301F)."""
    cjk_scripts = {'Hans', 'Hant', 'Jpan', 'Kore'}
    cjk_punctuation = range(0x3000, 0x301F + 1)
    for name, fallback_chain in _fallback_chains.items():
        for record in fallback_chain:
            if record.scripts.intersection(cjk_scripts):
                # CJK font seen. Stop checking the rest of the fonts.
                break
            assert_font_supports_none_of_chars(record.font, cjk_punctuation, name)


def getPostScriptName(font):
    """Return the PostScript name (nameID 6, Windows/English) of a
    (file, index) font, or None if absent."""
    font_file, index = font
    font_path = path.join(_fonts_dir, font_file)
    if index is not None:
        # Use the first font file in the collection for resolving post script name.
        ttf = ttLib.TTFont(font_path, fontNumber=0)
    else:
        ttf = ttLib.TTFont(font_path)

    nameTable = ttf['name']
    for name in nameTable.names:
        if (name.nameID == 6 and name.platformID == 3 and name.platEncID == 1
                and name.langID == 0x0409):
            return str(name)


def check_canonical_name():
    """Check each font's PostScript name against its declaration: the
    postScriptName attribute when present, otherwise the file name stem."""
    for record in _all_fonts:
        file_name, index = record.font

        psName = getPostScriptName(record.font)
        if record.psName:
            # If fonts element has postScriptName attribute, it should match with the PostScript
            # name in the name table.
            assert psName == record.psName, ('postScriptName attribute %s should match with %s' % (
                record.psName, psName))
        else:
            # If fonts element doesn't have postScriptName attribute, the file name should match
            # with the PostScript name in the name table.
            # Note: file_name[:-4] strips a 4-character extension such as
            # '.ttf'/'.otf'/'.ttc'.
            assert psName == file_name[:-4], ('file name %s should match with %s' % (
                file_name, psName))


def main():
    """Entry point.

    argv[1]: target output directory (contains fonts/ and etc/fonts.xml).
    argv[2]: 'true' to also run the emoji checks.
    argv[3]: UCD data path (required when argv[2] is 'true').
    """
    global _fonts_dir
    target_out = sys.argv[1]
    _fonts_dir = path.join(target_out, 'fonts')

    fonts_xml_path = path.join(target_out, 'etc', 'fonts.xml')
    parse_fonts_xml(fonts_xml_path)

    check_compact_only_fallback()

    check_vertical_metrics()

    hyphens_dir = path.join(target_out, 'usr', 'hyphen-data')
    check_hyphens(hyphens_dir)

    check_cjk_punctuation()

    check_canonical_name()

    check_emoji = sys.argv[2]
    if check_emoji == 'true':
        ucd_path = sys.argv[3]
        parse_ucd(ucd_path)
        all_emoji, default_emoji, equivalent_emoji = compute_expected_emoji()
        check_emoji_compat()
        check_emoji_coverage(all_emoji, equivalent_emoji)
        check_emoji_defaults(default_emoji)


if __name__ == '__main__':
    main()