import re
import unicodedata
from collections import defaultdict
localmap = {
"LATIN SMALL LETTER AE":
["LATIN SMALL LETTER A", "LATIN SMALL LETTER E"],
"LATIN CAPITAL LETTER AE":
["LATIN CAPITAL LETTER A", "LATIN CAPITAL LETTER E"],
"LATIN SMALL LIGATURE OE":
["LATIN SMALL LETTER O", "LATIN SMALL LETTER E"],
"LATIN CAPITAL LIGATURE OE":
["LATIN CAPITAL LETTER O", "LATIN CAPITAL LETTER E"],
"LATIN SMALL LIGATURE IJ":
["LATIN SMALL LETTER I", "LATIN SMALL LETTER J"],
"LATIN CAPITAL LIGATURE IJ":
["LATIN CAPITAL LETTER I", "LATIN CAPITAL LETTER J"],
"LATIN SMALL LETTER SHARP S":
["LATIN SMALL LETTER S", "LATIN SMALL LETTER S"],
"LATIN SMALL LETTER DOTLESS I":
["LATIN SMALL LETTER I"],
"LATIN SMALL LETTER DOTLESS J WITH STROKE":
["LATIN SMALL LETTER J"],
'DOUBLE PRIME': ['QUOTATION MARK'],
'PRIME': ['APOSTROPHE'],
'SINGLE LEFT-POINTING ANGLE QUOTATION MARK': ['LESS-THAN SIGN'],
'SINGLE RIGHT-POINTING ANGLE QUOTATION MARK': ['GREATER-THAN SIGN'],
'HORIZONTAL ELLIPSIS': ['FULL STOP', 'FULL STOP', 'FULL STOP'],
'RIGHT SINGLE QUOTATION MARK': ['APOSTROPHE'],
'LEFT SINGLE QUOTATION MARK': ['APOSTROPHE'],
'SINGLE LOW-9 QUOTATION MARK': ['APOSTROPHE'],
'RIGHT DOUBLE QUOTATION MARK': ['QUOTATION MARK'],
'LEFT DOUBLE QUOTATION MARK': ['QUOTATION MARK'],
'DOUBLE LOW-9 QUOTATION MARK': ['QUOTATION MARK'],
'HYPHEN': ['HYPHEN-MINUS'],
'EN DASH': ['HYPHEN-MINUS'],
'EM DASH': ['HYPHEN-MINUS'],
'FIGURE DASH': ['HYPHEN-MINUS'],
'HORIZONTAL BAR': ['HYPHEN-MINUS'],
'MINUS SIGN': ['HYPHEN-MINUS'],
}
decomposition_table = {}
marks = set()
for code in xrange(0x10000):
char = unichr(code)
name = unicodedata.name(char, '')
decomposition = unicodedata.decomposition(char).split()
if decomposition:
if decomposition[0].startswith('<'):
del decomposition[0]
if decomposition:
decomposition_table[code] = [int(c, 16) for c in decomposition]
elif ' WITH ' in name:
decomposition_table[code] = None
category = unicodedata.category(char)
if category.startswith('M'):
marks.add(code)
for from_name, to_names in localmap.iteritems():
from_code = ord(unicodedata.lookup(from_name))
decomposition = []
for to_name in to_names:
to_code = ord(unicodedata.lookup(to_name))
decomposition.append(to_code)
decomposition_table[from_code] = decomposition
for from_code in sorted(decomposition_table.keys()):
decomposition = []
codes = [from_code]
while codes:
code = codes.pop(0)
old_decomposition = decomposition_table.get(code)
if old_decomposition:
codes.extend(old_decomposition)
continue
name = unicodedata.name(unichr(code))
match = re.match(r'(.*?)\s+WITH .*', name)
if match:
stripped_name = match.group(1)
try:
codes.append(ord(unicodedata.lookup(stripped_name)))
except KeyError:
pass
else:
continue
if code not in marks:
decomposition.append(code)
if decomposition and decomposition[0] != from_code:
decomposition_table[from_code] = decomposition
else:
del decomposition_table[from_code]
def reference(decomposition_table):
m = 0
for code_value in xrange(0x10000):
print "%04X" % (code_value,),
decomposition = decomposition_table.get(code_value)
if decomposition:
print "=>", " ".join("%04X" % (c,) for c in decomposition)
else:
print
def source(decomposition_table):
verbose = False
best_total_size = 10 * 1024 * 1024
for block_shift in xrange(2, 11):
block_count = 1 << block_shift
blocks = []
indexes = []
duplicate = 0
block = []
for code in xrange(0x10001):
if code > 0 and code % block_count == 0:
index = 0
for existing_block in blocks:
if block == existing_block:
indexes.append(index)
duplicate += 1
break
index += 1
else:
indexes.append(index)
blocks.append(block)
block = []
if code in decomposition_table:
block.append(decomposition_table[code])
else:
block.append([])
if verbose:
print len(blocks), "blocks of", block_count, "entries, factorized", duplicate, "blocks"
block_size = 0
for block in blocks:
block_size += sum(len(v) for v in block) * 2
block_size += len(blocks) * 4
block_size += block_count * len(blocks) * 2
index_size = (1 << (16 - block_shift)) * 2
total_size = block_size + index_size
if verbose:
print "\ttotal block size = ", block_size, ", index size = ", index_size
print "\ttotal size = ", total_size
if total_size < best_total_size:
best_total_size = total_size
best_blocks = blocks;
best_indexes = indexes
best_block_shift = block_shift
block_count = len(best_blocks)
block_size = 1 << best_block_shift
indexes_out = []
for i in range(0, len(best_indexes), 15):
indexes_out.append(','.join(map(str, best_indexes[i:i+15])))
positions_out = []
data1_out = []
data2_out = []
for i, block in enumerate(best_blocks):
position = 0
index = []
data = []
for entry in block:
index.append(position)
data.extend(entry)
position += len(entry)
index.append(position)
positions_out.append('{%s}' % ','.join(map(str, index)))
cc = []
for c in data:
if c <= 0xFFFF:
cc.append("0x%04X" % c)
else:
cc.append("0xFFFD")
data1_out.append('unsigned short unaccent_data_%d[] = {%s};' % (i, ','.join(cc)))
data2_out.append('unaccent_data_%d' % (i,))
data_template = '''/* Generated by builder. Do not modify. */
#define UNACCENT_BLOCK_SHIFT %(block_shift)s
#define UNACCENT_BLOCK_MASK ((1 << UNACCENT_BLOCK_SHIFT) - 1)
%(data1)s
unsigned short *unaccent_data[] = {
%(data2)s
};
'''
source = data_template % dict(
data1='\n'.join(data1_out),
data2=',\n'.join(data2_out),
block_shift=best_block_shift)
f = open("musicbrainz_unaccent_data.h", "w")
f.write(source)
indexes_template = '''
unsigned char unaccent_indexes[] = {
%(indexes)s
};
'''
source = indexes_template % dict(
indexes=',\n'.join(indexes_out))
f.write(source)
positions_template = '''
unsigned char unaccent_positions[%(y)s][%(x)s] = {
%(positions)s
};
'''
source = positions_template % dict(
positions=',\n'.join(positions_out),
y=len(positions_out),
x=block_size+1)
f.write(source)
f.close()
source(decomposition_table)