1 """ Unicode Mapping Parser and Codec Generator.
3 This script parses Unicode mapping files as available from the Unicode
4 site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
5 modules from them. The codecs use the standard character mapping codec
6 to actually apply the mapping.
8 Synopsis: gencodec.py dir codec_prefix
10 All files in dir are scanned and those producing non-empty mappings
11 will be written to <codec_prefix><mapname>.py with <mapname> being the
12 first part of the map's filename ('a' in a.b.c.txt) converted to
13 lowercase with hyphens replaced by underscores.
15 The tool also writes marshalled versions of the mapping tables to the
16 same location (with .mapping extension).
18 Written by Marc-Andre Lemburg (mal@lemburg.com).
20 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
21 (c) Copyright Guido van Rossum, 2000.
24 (c) Copyright Marc-Andre Lemburg, 2005.
25 Licensed to PSF under a Contributor Agreement.
29 import re, os, marshal, codecs
31 # Maximum allowed size of charmap tables
34 # Standard undefined Unicode code point
35 UNI_UNDEFINED = unichr(0xFFFE)
37 mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)'
39 '((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
44 len=len, filter=filter,range=range):
46 """ Converts code combinations to either a single code integer
47 or a tuple of integers.
49 meta-codes (in angle brackets, e.g. <LR> and <RL>) are
52 Empty codes or illegal ones are returned as None.
60 for i in range(len(l)):
65 l = filter(lambda x: x is not None, l)
71 def readmap(filename):
73 f = open(filename,'r')
80 # UTC mapping tables per convention don't include the identity
81 # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
82 # explicitly mapped to different characters or undefined
83 for i in range(32) + [127]:
86 enc2uni[i] = (i, 'CONTROL CHARACTER')
90 if not line or line[0] == '#':
94 #print '* not matched: %s' % repr(line)
96 enc,uni,comment = m.groups()
102 comment = comment[1:].strip()
108 enc2uni[enc] = (uni,comment)
110 enc2uni[enc] = (uni,comment)
112 # If there are more identity-mapped entries than unmapped entries,
113 # it pays to generate an identity dictionary first, and add explicit
114 # mappings to None for the rest
115 if len(identity) >= len(unmapped):
117 enc2uni[enc] = (None, "")
118 enc2uni['IDENTITY'] = 256
122 def hexrepr(t, precision=4):
129 return '0x%0*X' % (precision, t)
131 return '(' + ', '.join(['0x%0*X' % (precision, item)
132 for item in t]) + ')'
133 except TypeError, why:
134 print '* failed to convert %r: %s' % (t, why)
137 def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):
141 if map.has_key("IDENTITY"):
142 append("%s = codecs.make_identity_dict(range(%d))" %
143 (varname, map["IDENTITY"]))
144 append("%s.update({" % varname)
149 append("%s = {" % varname)
153 mappings = map.items()
156 key_precision, value_precision = precisions
157 for mapkey, mapvalue in mappings:
159 if isinstance(mapkey, tuple):
160 (mapkey, mapcomment) = mapkey
161 if isinstance(mapvalue, tuple):
162 (mapvalue, mapcomment) = mapvalue
166 mapkey == mapvalue and
168 # No need to include identity mappings, since these
169 # are already set for the first 256 code points.
171 key = hexrepr(mapkey, key_precision)
172 value = hexrepr(mapvalue, value_precision)
173 if mapcomment and comments:
174 append(' %s: %s,\t# %s' % (key, value, mapcomment))
176 append(' %s: %s,' % (key, value))
179 # Split the definition into parts so that the Python
180 # parser doesn't dump core
185 append('%s.update({' % varname)
195 def python_tabledef_code(varname, map, comments=1, key_precision=2):
199 append('%s = (' % varname)
201 # Analyze map and create table dict
202 mappings = map.items()
206 if map.has_key('IDENTITY'):
207 for key in range(256):
208 table[key] = (key, '')
211 for mapkey, mapvalue in mappings:
213 if isinstance(mapkey, tuple):
214 (mapkey, mapcomment) = mapkey
215 if isinstance(mapvalue, tuple):
216 (mapvalue, mapcomment) = mapvalue
219 table[mapkey] = (mapvalue, mapcomment)
222 if maxkey > MAX_TABLE_SIZE:
227 for key in range(maxkey + 1):
230 mapcomment = 'UNDEFINED'
232 mapvalue, mapcomment = table[key]
234 mapchar = UNI_UNDEFINED
236 if isinstance(mapvalue, tuple):
237 # 1-n mappings not supported
240 mapchar = unichr(mapvalue)
241 if mapcomment and comments:
242 append(' %r\t# %s -> %s' % (mapchar,
243 hexrepr(key, key_precision),
246 append(' %r' % mapchar)
251 def codegen(name, map, encodingname, comments=1):
253 """ Returns Python source for the given map.
255 Comments are included in the source, if comments is true (default).
259 decoding_map_code = python_mapdef_code(
263 decoding_table_code = python_tabledef_code(
267 encoding_map_code = python_mapdef_code(
269 codecs.make_encoding_map(map),
273 if decoding_table_code:
280 """ Python Character Mapping Codec %s generated from '%s' with gencodec.py.
288 class Codec(codecs.Codec):
290 def encode(self,input,errors='strict'):
291 return codecs.charmap_encode(input,errors,encoding_%s)
293 def decode(self,input,errors='strict'):
294 return codecs.charmap_decode(input,errors,decoding_%s)
295 ''' % (encodingname, name, suffix, suffix)]
297 class IncrementalEncoder(codecs.IncrementalEncoder):
298 def encode(self, input, final=False):
299 return codecs.charmap_encode(input,self.errors,encoding_%s)[0]
301 class IncrementalDecoder(codecs.IncrementalDecoder):
302 def decode(self, input, final=False):
303 return codecs.charmap_decode(input,self.errors,decoding_%s)[0]''' %
307 class StreamWriter(Codec,codecs.StreamWriter):
310 class StreamReader(Codec,codecs.StreamReader):
313 ### encodings module API
316 return codecs.CodecInfo(
318 encode=Codec().encode,
319 decode=Codec().decode,
320 incrementalencoder=IncrementalEncoder,
321 incrementaldecoder=IncrementalDecoder,
322 streamreader=StreamReader,
323 streamwriter=StreamWriter,
325 ''' % encodingname.replace('_', '-'))
327 # Add decoding table or map (with preference to the table)
328 if not decoding_table_code:
332 l.extend(decoding_map_code)
337 l.extend(decoding_table_code)
340 if decoding_table_code:
343 encoding_table=codecs.charmap_build(decoding_table)
349 l.extend(encoding_map_code)
354 return '\n'.join(l).expandtabs()
356 def pymap(name,map,pyfile,encodingname,comments=1):
358 code = codegen(name,map,encodingname,comments)
363 def marshalmap(name,map,marshalfile):
366 for e,(u,c) in map.items():
368 f = open(marshalfile,'wb')
372 def convertdir(dir, dirprefix='', nameprefix='', comments=1):
374 mapnames = os.listdir(dir)
375 for mapname in mapnames:
376 mappathname = os.path.join(dir, mapname)
377 if not os.path.isfile(mappathname):
379 name = os.path.split(mapname)[1]
380 name = name.replace('-','_')
381 name = name.split('.')[0]
383 name = nameprefix + name
384 codefile = name + '.py'
385 marshalfile = name + '.mapping'
386 print 'converting %s to %s and %s' % (mapname,
387 dirprefix + codefile,
388 dirprefix + marshalfile)
390 map = readmap(os.path.join(dir,mapname))
392 print '* map is empty; skipping'
394 pymap(mappathname, map, dirprefix + codefile,name,comments)
395 marshalmap(mappathname, map, dirprefix + marshalfile)
396 except ValueError, why:
397 print '* conversion failed: %s' % why
400 def rewritepythondir(dir, dirprefix='', comments=1):
402 mapnames = os.listdir(dir)
403 for mapname in mapnames:
404 if not mapname.endswith('.mapping'):
406 name = mapname[:-len('.mapping')]
407 codefile = name + '.py'
408 print 'converting %s to %s' % (mapname,
409 dirprefix + codefile)
411 map = marshal.load(open(os.path.join(dir,mapname),
414 print '* map is empty; skipping'
416 pymap(mapname, map, dirprefix + codefile,name,comments)
417 except ValueError, why:
418 print '* conversion failed: %s' % why
420 if __name__ == '__main__':
424 apply(convertdir,tuple(sys.argv[1:]))
426 apply(rewritepythondir,tuple(sys.argv[1:]))