Package gluon :: Module decoder
[hide private]
[frames | no frames]

Source Code for Module gluon.decoder

 1  #!/usr/bin/env python 
 2  # -*- coding: utf-8 -*- 
 3   
 4  """ 
 5  Caller will hand this library a buffer and ask it to either convert 
 6  it or auto-detect the type. 
 7   
 8  Based on http://code.activestate.com/recipes/52257/ 
 9   
10  Licensed under the PSF License 
11  """ 
12   
13  import codecs 
14   
# Leading-byte signatures used to guess the encoding of an XML document,
# mapped to the codec name used to start decoding it.
# None represents a potentially variable byte ("##" in the XML spec).
autodetect_dict = {
    # byte pattern          : codec name
    (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",  # UCS-4 BOM, big-endian
    (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",  # UCS-4 BOM, little-endian
    (0xFE, 0xFF, None, None): "utf_16_be",
    (0xFF, 0xFE, None, None): "utf_16_le",
    (0x00, 0x3C, 0x00, 0x3F): "utf_16_be",
    (0x3C, 0x00, 0x3F, 0x00): "utf_16_le",
    (0x3C, 0x3F, 0x78, 0x6D): "utf_8",
    # Generic family name; may have no installed decoder (see below).
    (0x4C, 0x6F, 0xA7, 0x94): "EBCDIC",
}


def autoDetectXMLEncoding(buffer):
    """Guess the character encoding of an XML document.

    buffer -> encoding_name

    The first four bytes of *buffer* are matched against
    ``autodetect_dict``; if the document then turns out to start with an
    XML declaration carrying an ``encoding`` pseudo-attribute, the declared
    name replaces the guess.

    The buffer should be at least 4 bytes long; shorter buffers skip the
    signature check.  When nothing more specific is found the XML default,
    ``"utf_8"``, is returned.

    Note that encoding_name might not have an installed decoder
    (e.g. EBCDIC); in that case the declaration cannot be read and the
    signature-based guess is returned unchanged.
    """
    encoding = "utf_8"  # according to the XML spec, this is the default

    # Iterating a byte string yields ints on Python 3 and 1-character
    # strings on Python 2; normalise both to integer byte values.
    signature = tuple(b if isinstance(b, int) else ord(b)
                      for b in buffer[0:4])
    if len(signature) == 4:
        guess = autodetect_dict.get(signature)
        if not guess:
            # try autodetection again removing potentially variable bytes
            guess = autodetect_dict.get(signature[:2] + (None, None))
        if guess:
            encoding = guess  # we've got a guess... this is the new default

    # try to find a more precise encoding using the xml declaration
    try:
        decode = codecs.lookup(encoding).decode
    except LookupError:
        return encoding

    # Only the declaration matters here, so bytes that the bootstrap codec
    # cannot decode further down the document must not abort detection.
    text = decode(buffer, "replace")[0]
    # A byte order mark survives decoding as U+FEFF; drop it so the
    # declaration test below still sees the start of the document.
    first_line = text.lstrip(u"\ufeff").split(u"\n", 1)[0]
    if not first_line.startswith(u"<?xml"):
        return encoding

    encoding_pos = first_line.find(u"encoding")
    if encoding_pos == -1:
        return encoding

    # The value is delimited by whichever quote character comes first
    # after the attribute name; a later attribute may use the other kind.
    quote_positions = [
        pos
        for pos in (first_line.find(u'"', encoding_pos),
                    first_line.find(u"'", encoding_pos))
        if pos != -1
    ]
    if not quote_positions:
        return encoding
    start = min(quote_positions)
    end = first_line.find(first_line[start], start + 1)
    if end == -1:
        # unterminated value: keep the last good guess
        return encoding

    declared = first_line[start + 1:end].strip()
    if declared:
        encoding = declared
    return encoding
76 77
def decoder(buffer):
    """Return *buffer* re-encoded as UTF-8.

    The source encoding is whatever ``autoDetectXMLEncoding`` reports for
    the buffer; the buffer is decoded with that encoding and the resulting
    text is encoded again as UTF-8.
    """
    source_encoding = autoDetectXMLEncoding(buffer)
    text = buffer.decode(source_encoding)
    return text.encode('utf8')
81