1
2
3
4 """
The caller hands this library a buffer and asks it either to convert
the buffer or to auto-detect its encoding.
7
8 Based on http://code.activestate.com/recipes/52257/
9
10 Licensed under the PSF License
11 """
12
13 import codecs
14
15
# Maps the leading bytes of an XML document to the name of the encoding
# they imply.  A key whose last two entries are None is matched on the
# first two bytes only: the lookup retries with (byte1, byte2, None, None)
# when the exact four-byte key misses, so four-byte marks take precedence
# over the two-byte marks that share their prefix.
autodetect_dict = {
    # Byte-order marks.  The four-byte UCS-4 marks are spelled with
    # Python's UTF-32 codec names so that codecs.lookup() can resolve
    # them; "ucs4_be" / "ucs4_le" are not registered codec names.
    (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
    (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
    (0xFE, 0xFF, None, None): "utf_16_be",
    (0xFF, 0xFE, None, None): "utf_16_le",
    # No byte-order mark: the characters "<?" encoded as UTF-16.
    (0x00, 0x3C, 0x00, 0x3F): "utf_16_be",
    (0x3C, 0x00, 0x3F, 0x00): "utf_16_le",
    # No byte-order mark: "<?xm" in an ASCII-compatible encoding.
    (0x3C, 0x3F, 0x78, 0x6D): "utf_8",
    # "<?xm" in EBCDIC.  Python registers no codec under this name, so
    # codecs.lookup("EBCDIC") raises LookupError.
    (0x4C, 0x6F, 0xA7, 0x94): "EBCDIC",
}
26
27
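    """ buffer -> encoding_name
    The buffer should be at least 4 bytes long; shorter buffers skip
    the byte-signature detection.
    Returns "utf_8" when no byte signature matches and the first line
    carries no XML declaration naming an encoding.
    Note that an encoding_name read from the XML declaration is returned
    without checking that a decoder is installed for it, while a
    signature-detected name with no installed decoder (e.g. EBCDIC)
    makes codecs.lookup raise LookupError.
    """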
35
36
37
38
39 encoding = "utf_8"
40
41
42
43 if len(buffer) >= 4:
44 bytes = (byte1, byte2, byte3, byte4) = tuple(map(ord, buffer[0:4]))
45 enc_info = autodetect_dict.get(bytes, None)
46 if not enc_info:
47
48 bytes = (byte1, byte2, None, None)
49 enc_info = autodetect_dict.get(bytes)
50 else:
51 enc_info = None
52
53 if enc_info:
54 encoding = enc_info
55
56
57
58 secret_decoder_ring = codecs.lookup(encoding)[1]
59 (decoded, length) = secret_decoder_ring(buffer)
60 first_line = decoded.split("\n")[0]
61 if first_line and first_line.startswith(u"<?xml"):
62 encoding_pos = first_line.find(u"encoding")
63 if encoding_pos != -1:
64
65 quote_pos = first_line.find('"', encoding_pos)
66
67 if quote_pos == -1:
68 quote_pos = first_line.find("'", encoding_pos)
69
70 if quote_pos > -1:
71 quote_char, rest = (first_line[quote_pos],
72 first_line[quote_pos + 1:])
73 encoding = rest[:rest.find(quote_char)]
74
75 return encoding
76
77
81