1
2
3 """
4 This file is part of the web2py Web Framework
5 Copyrighted by Massimo Di Pierro <mdipierro@cs.depaul.edu>
6 License: LGPLv3 (http://www.gnu.org/licenses/lgpl.html)
7
8 Created by Vladyslav Kozlovskyy (Ukraine) <dbdevelop©gmail.com>
9 for Web2py project
10
Utilities and a class for managing UTF-8 strings
================================================
13 """
14 import __builtin__
15 __all__ = ['Utf8']
16
17 repr_escape_tab = {}
18 for i in range(1, 32):
19 repr_escape_tab[i] = ur'\x%02x' % i
20 repr_escape_tab[7] = u'\\a'
21 repr_escape_tab[8] = u'\\b'
22 repr_escape_tab[9] = u'\\t'
23 repr_escape_tab[10] = u'\\n'
24 repr_escape_tab[11] = u'\\v'
25 repr_escape_tab[12] = u'\\f'
26 repr_escape_tab[13] = u'\\r'
27 repr_escape_tab[ord('\\')] = u'\\\\'
28 repr_escape_tab2 = repr_escape_tab.copy()
29 repr_escape_tab2[ord('\'')] = u"\\'"
30
31
33 """ Unicode Collation Algorithm (UCA) (http://www.unicode.org/reports/tr10/)
34 is used for utf-8 and unicode strings sorting and for utf-8 strings
35 comparison
36
    NOTE: pyuca is a very memory-hungry module! It loads the whole
          "allkeys.txt" file (~2 MB!) into memory. But this
          functionality is needed only when sort_key() is called as
          part of a sort() call or when Utf8 strings are compared.
41
42 So, it is a lazy "sort_key" function which (ONLY ONCE, ON ITS
43 FIRST CALL) imports pyuca and replaces itself with a real
44 sort_key() function
45 """
46 global sort_key
47 try:
48 from gluon.contrib.pyuca import unicode_collator
49 unicode_sort_key = unicode_collator.sort_key
50 sort_key = lambda s: unicode_sort_key(
51 unicode(s, 'utf-8') if isinstance(s, str) else s)
52 except:
53 sort_key = lambda s: (
54 unicode(s, 'utf-8') if isinstance(s, str) else s).lower()
55 return sort_key(s)
56
57
59 """ returns unicode id for utf8 or unicode *char* character
60
61 SUPPOSE that *char* is an utf-8 or unicode character only
62 """
63 if isinstance(char, unicode):
64 return __builtin__.ord(char)
65 return __builtin__.ord(unicode(char, 'utf-8'))
66
67
69 """ return utf8-character with *code* unicode id """
70 return Utf8(unichr(code))
71
72
74 """ return length of utf-8 string in bytes
75 NOTE! The length of correspondent utf-8
76 string is returned for unicode string
77 """
78 return Utf8(string).__size__()
79
80
def truncate(string, length, dots='...'):
    """ returns *string* unchanged when it has at most *length* characters,
    otherwise cuts it and appends the *dots* suffix

    Lengths are counted in unicode characters, not in utf-8 bytes.
    A cut result is exactly *length* characters long as long as
    *length* >= len(dots); for a shorter *length* the result is the
    *dots* suffix alone.

    args:
        string (str or unicode): utf-8 encoded or unicode text
        length (int): max length of the result, in characters
        dots (str or unicode): suffix appended when the string is cut

    returns:
        (utf8-str): original or cut string
    """
    # unicode(x, 'utf-8') raises TypeError when x is already unicode,
    # so unicode input is used as is
    text = string if isinstance(string, unicode) else unicode(string, 'utf-8')
    dots = unicode(dots, 'utf-8') if isinstance(dots, str) else dots
    if len(text) > length:
        # clamp at 0: a negative stop would count from the end of the text
        # and produce a result longer than the input
        text = text[:max(length - len(dots), 0)] + dots
    return str.__new__(Utf8, text.encode('utf-8'))
97
98
100 """
101 Class for utf8 string storing and manipulations
102
103 The base presupposition of this class usage is:
104 "ALL strings in the application are either of
105 utf-8 or unicode type, even when simple str
106 type is used. UTF-8 is only a "packed" version
107 of unicode, so Utf-8 and unicode strings are
108 interchangeable."
109
110 CAUTION! This class is slower than str/unicode!
111 Do NOT use it inside intensive loops. Simply
112 decode string(s) to unicode before loop and
113 encode it back to utf-8 string(s) after
114 intensive calculation.
115
116 You can see the benefit of this class in doctests() below
117 """
def __new__(cls, content='', codepage='utf-8'):
    """ builds the utf-8 byte string that backs the new instance

    args:
        content: unicode text is encoded to utf-8; anything else is
            passed to str unchanged when *codepage* is utf-8 (or when
            *content* is already an instance of *cls*), otherwise it is
            decoded from *codepage* and re-encoded as utf-8
        codepage (str): encoding of a non-unicode *content*
    """
    if isinstance(content, unicode):
        data = unicode.encode(content, 'utf-8')
    elif codepage in ('utf-8', 'utf8') or isinstance(content, cls):
        # already utf-8: no transcoding needed
        data = content
    else:
        data = unicode(content, codepage).encode('utf-8')
    return str.__new__(cls, data)
125
127 r''' # note that we use raw strings to avoid having to use double back slashes below
128 NOTE! This function is a clone of web2py:gluon.languages.utf_repl() function
129
130 utf8.__repr__() works same as str.repr() when processing ascii string
131 >>> repr(Utf8('abc')) == repr(Utf8("abc")) == repr('abc') == repr("abc") == "'abc'"
132 True
133 >>> repr(Utf8('a"b"c')) == repr('a"b"c') == '\'a"b"c\''
134 True
135 >>> repr(Utf8("a'b'c")) == repr("a'b'c") == '"a\'b\'c"'
136 True
137 >>> repr(Utf8('a\'b"c')) == repr('a\'b"c') == repr(Utf8("a'b\"c")) == repr("a'b\"c") == '\'a\\\'b"c\''
138 True
139 >>> repr(Utf8('a\r\nb')) == repr('a\r\nb') == "'a\\r\\nb'" # Test for \r, \n
140 True
141
142 Unlike str.repr(), Utf8.__repr__() remains utf8 content when processing utf8 string
143 >>> repr(Utf8('中文字')) == repr(Utf8("中文字")) == "'中文字'" != repr('中文字')
144 True
145 >>> repr(Utf8('中"文"字')) == "'中\"文\"字'" != repr('中"文"字')
146 True
147 >>> repr(Utf8("中'文'字")) == '"中\'文\'字"' != repr("中'文'字")
148 True
149 >>> repr(Utf8('中\'文"字')) == repr(Utf8("中'文\"字")) == '\'中\\\'文"字\'' != repr('中\'文"字') == repr("中'文\"字")
150 True
151 >>> repr(Utf8('中\r\n文')) == "'中\\r\\n文'" != repr('中\r\n文') # Test for \r, \n
152 True
153 '''
154 if str.find(self, "'") >= 0 and str.find(self, '"') < 0:
155 return '"' + unicode(self, 'utf-8').translate(repr_escape_tab).encode('utf-8') + '"'
156 else:
157 return "'" + unicode(self, 'utf-8').translate(repr_escape_tab2).encode('utf-8') + "'"
158
160 """ length of utf-8 string in bytes """
161 return str.__len__(self)
162
165
168
171
173 return str.__new__(Utf8, str.__add__(self, unicode.encode(other, 'utf-8')
174 if isinstance(other, unicode) else other))
175
177 return len(unicode(self, 'utf-8'))
178
181
184
187
190
193
196
199
202
203 - def index(self, string):
204 return unicode(self, 'utf-8').index(string if isinstance(string, unicode) else unicode(string, 'utf-8'))
205
207 return unicode(self, 'utf-8').isalnum()
208
210 return unicode(self, 'utf-8').isalpha()
211
213 return unicode(self, 'utf-8').isdigit()
214
216 return unicode(self, 'utf-8').islower()
217
219 return unicode(self, 'utf-8').isspace()
220
222 return unicode(self, 'utf-8').istitle()
223
225 return unicode(self, 'utf-8').isupper()
226
227 - def zfill(self, length):
229
def join(self, iter):
    """ joins the items of *iter* with this string as the separator

    A byte string passed as *iter* is decoded as utf-8 first, so it is
    joined character by character rather than byte by byte. Every item
    is converted with Utf8() before joining.

    returns:
        (utf8-str): the joined string
    """
    if isinstance(iter, str):
        items = list(unicode(iter, 'utf-8'))
    else:
        items = list(iter)
    pieces = [Utf8(item) for item in items]
    return str.__new__(Utf8, str.join(self, pieces))
235
236 - def lstrip(self, chars=None):
238
239 - def rstrip(self, chars=None):
241
242 - def strip(self, chars=None):
244
247
248 - def count(self, sub, start=0, end=None):
249 unistr = unicode(self, 'utf-8')
250 return unistr.count(
251 unicode(sub, 'utf-8') if isinstance(sub, str) else sub,
252 start, len(unistr) if end is None else end)
253
254 - def decode(self, encoding='utf-8', errors='strict'):
255 return str.decode(self, encoding, errors)
256
257 - def encode(self, encoding, errors='strict'):
258 return unicode(self, 'utf-8').encode(encoding, errors)
259
262
263 - def find(self, sub, start=None, end=None):
264 return unicode(self, 'utf-8').find(unicode(sub, 'utf-8')
265 if isinstance(sub, str) else sub, start, end)
266
def ljust(self, width, fillchar=' '):
    """ left-justifies the string in a field of *width* characters

    Padding is computed on the decoded text, so *width* counts
    characters rather than bytes. *fillchar* may be a utf-8 byte
    string or unicode.

    returns:
        (utf8-str): the padded string
    """
    text = unicode(self, 'utf-8')
    if isinstance(fillchar, str):
        fillchar = unicode(fillchar, 'utf-8')
    padded = text.ljust(width, fillchar)
    return str.__new__(Utf8, padded.encode('utf-8'))
270
276
277 - def replace(self, old, new, count=-1):
279
280 - def rfind(self, sub, start=None, end=None):
281 return unicode(self, 'utf-8').rfind(unicode(sub, 'utf-8')
282 if isinstance(sub, str) else sub, start, end)
283
285 return unicode(self, 'utf-8').rindex(string if isinstance(string, unicode)
286 else unicode(string, 'utf-8'))
287
def rjust(self, width, fillchar=' '):
    """ right-justifies the string in a field of *width* characters

    Padding is computed on the decoded text, so *width* counts
    characters rather than bytes. *fillchar* may be a utf-8 byte
    string or unicode.

    returns:
        (utf8-str): the padded string
    """
    text = unicode(self, 'utf-8')
    if isinstance(fillchar, str):
        fillchar = unicode(fillchar, 'utf-8')
    padded = text.rjust(width, fillchar)
    return str.__new__(Utf8, padded.encode('utf-8'))
291
297
def rsplit(self, sep=None, maxsplit=-1):
    """ splits the string from the right, like str.rsplit()

    A separator other than None is converted with Utf8() so that
    unicode separators work; every resulting part is a Utf8 instance.
    """
    if sep is not None:
        sep = Utf8(sep)
    pieces = str.rsplit(self, sep, maxsplit)
    return [str.__new__(Utf8, piece) for piece in pieces]
301
def split(self, sep=None, maxsplit=-1):
    """ splits the string from the left, like str.split()

    A separator other than None is converted with Utf8() so that
    unicode separators work; every resulting part is a Utf8 instance.
    """
    if sep is not None:
        sep = Utf8(sep)
    pieces = str.split(self, sep, maxsplit)
    return [str.__new__(Utf8, piece) for piece in pieces]
305
308
310 unistr = unicode(self, 'utf-8')
311 if isinstance(prefix, tuple):
312 prefix = tuple(unicode(
313 s, 'utf-8') if isinstance(s, str) else s for s in prefix)
314 elif isinstance(prefix, str):
315 prefix = unicode(prefix, 'utf-8')
316 return unistr.startswith(prefix, start, len(unistr) if end is None else end)
317
323
324 - def endswith(self, prefix, start=0, end=None):
325 unistr = unicode(self, 'utf-8')
326 if isinstance(prefix, tuple):
327 prefix = tuple(unicode(
328 s, 'utf-8') if isinstance(s, str) else s for s in prefix)
329 elif isinstance(prefix, str):
330 prefix = unicode(prefix, 'utf-8')
331 return unistr.endswith(prefix, start, len(unistr) if end is None else end)
332 if hasattr(str, 'format'):
341
343 if isinstance(right, tuple):
344 right = tuple(unicode(v, 'utf-8') if isinstance(v, str) else v
345 for v in right)
346 elif isinstance(right, dict):
347 right = dict((unicode(k, 'utf-8') if isinstance(k, str) else k,
348 unicode(v, 'utf-8') if isinstance(v, str) else v)
349 for k, v in right.iteritems())
350 elif isinstance(right, str):
351 right = unicode(right, 'utf-8')
352 return str.__new__(Utf8, unicode(self, 'utf-8').__mod__(right).encode('utf-8'))
353
356
359
362
365
366
367 if __name__ == '__main__':
369 u"""
370 doctests:
371 >>> test_unicode=u'ПРоба Є PRobe'
372 >>> test_unicode_word=u'ПРоба'
373 >>> test_number_str='12345'
374 >>> test_unicode
375 u'\\u041f\\u0420\\u043e\\u0431\\u0430 \\u0404 PRobe'
376 >>> print test_unicode
377 ПРоба Є PRobe
378 >>> test_word=test_unicode_word.encode('utf-8')
379 >>> test_str=test_unicode.encode('utf-8')
380 >>> s=Utf8(test_str)
381 >>> s
382 'ПРоба Є PRobe'
383 >>> type(s)
384 <class '__main__.Utf8'>
385 >>> s == test_str
386 True
387 >>> len(test_str) # wrong length of utf8-string!
388 19
389 >>> len(test_unicode) # RIGHT!
390 13
391 >>> len(s) # RIGHT!
392 13
393 >>> size(test_str) # size of utf-8 string (in bytes) == len(str)
394 19
395 >>> size(test_unicode) # size of unicode string in bytes (packed to utf-8 string)
396 19
397 >>> size(s) # size of utf-8 string in bytes
398 19
399 >>> try: # utf-8 is a multibyte string. Convert it to unicode for use with builtin ord()
400 ... __builtin__.ord('б') # ascii string
401 ... except Exception, e:
402 ... print 'Exception:', e
403 Exception: ord() expected a character, but string of length 2 found
404 >>> ord('б') # utf8.ord() is used(!!!)
405 1073
406 >>> ord(u'б') # utf8.ord() is used(!!!)
407 1073
408 >>> ord(s[3]) # utf8.ord() is used(!!!)
409 1073
410 >>> chr(ord(s[3])) # utf8.chr() and utf8.chr() is used(!!!)
411 'б'
412 >>> type(chr(1073)) # utf8.chr() is used(!!!)
413 <class '__main__.Utf8'>
414 >>> s=Utf8(test_unicode)
415 >>> s
416 'ПРоба Є PRobe'
417 >>> s == test_str
418 True
419 >>> test_str == s
420 True
421 >>> s == test_unicode
422 True
423 >>> test_unicode == s
424 True
425 >>> print test_str.upper() # only ASCII characters uppered
426 ПРоба Є PROBE
427 >>> print test_unicode.upper() # unicode gives right result
428 ПРОБА Є PROBE
429 >>> s.upper() # utf8 class use unicode.upper()
430 'ПРОБА Є PROBE'
431 >>> type(s.upper())
432 <class '__main__.Utf8'>
433 >>> s.lower()
434 'проба є probe'
435 >>> type(s.lower())
436 <class '__main__.Utf8'>
437 >>> s.capitalize()
438 'Проба є probe'
439 >>> type(s.capitalize())
440 <class '__main__.Utf8'>
441 >>> len(s)
442 13
443 >>> len(test_unicode)
444 13
445 >>> s+'. Probe is проба'
446 'ПРоба Є PRobe. Probe is проба'
447 >>> type(s+'. Probe is проба')
448 <class '__main__.Utf8'>
449 >>> s+u'. Probe is проба'
450 'ПРоба Є PRobe. Probe is проба'
451 >>> type(s+u'. Probe is проба')
452 <class '__main__.Utf8'>
453 >>> s+s
454 'ПРоба Є PRobeПРоба Є PRobe'
455 >>> type(s+s)
456 <class '__main__.Utf8'>
457 >>> a=s
458 >>> a+=s
459 >>> a+=test_unicode
460 >>> a+=test_str
461 >>> a
462 'ПРоба Є PRobeПРоба Є PRobeПРоба Є PRobeПРоба Є PRobe'
463 >>> type(a)
464 <class '__main__.Utf8'>
465 >>> s*3
466 'ПРоба Є PRobeПРоба Є PRobeПРоба Є PRobe'
467 >>> type(s*3)
468 <class '__main__.Utf8'>
469 >>> a=Utf8("-проба-")
470 >>> a*=10
471 >>> a
472 '-проба--проба--проба--проба--проба--проба--проба--проба--проба--проба-'
473 >>> type(a)
474 <class '__main__.Utf8'>
475 >>> print "'"+test_str.center(17)+"'" # WRONG RESULT!
476 'ПРоба Є PRobe'
477 >>> s.center(17) # RIGHT!
478 ' ПРоба Є PRobe '
479 >>> type(s.center(17))
480 <class '__main__.Utf8'>
481 >>> (test_word+test_number_str).isalnum() # WRONG RESULT! non ASCII chars are detected as non alpha
482 False
483 >>> Utf8(test_word+test_number_str).isalnum()
484 True
485 >>> s.isalnum()
486 False
487 >>> test_word.isalpha() # WRONG RESULT! Non ASCII characters are detected as non alpha
488 False
489 >>> Utf8(test_word).isalpha() # RIGHT!
490 True
491 >>> s.lower().islower()
492 True
493 >>> s.upper().isupper()
494 True
495 >>> print test_str.zfill(17) # WRONG RESULT!
496 ПРоба Є PRobe
497 >>> s.zfill(17) # RIGHT!
498 '0000ПРоба Є PRobe'
499 >>> type(s.zfill(17))
500 <class '__main__.Utf8'>
501 >>> s.istitle()
502 False
503 >>> s.title().istitle()
504 True
505 >>> Utf8('1234').isdigit()
506 True
507 >>> Utf8(' \t').isspace()
508 True
509 >>> s.join('•|•')
510 '•ПРоба Є PRobe|ПРоба Є PRobe•'
511 >>> s.join((str('(utf8 тест1)'), unicode('(unicode тест2)','utf-8'), '(ascii test3)'))
512 '(utf8 тест1)ПРоба Є PRobe(unicode тест2)ПРоба Є PRobe(ascii test3)'
513 >>> type(s)
514 <class '__main__.Utf8'>
515 >>> s==test_str
516 True
517 >>> s==test_unicode
518 True
519 >>> s.swapcase()
520 'прОБА є prOBE'
521 >>> type(s.swapcase())
522 <class '__main__.Utf8'>
523 >>> truncate(s, 10)
524 'ПРоба Є...'
525 >>> truncate(s, 20)
526 'ПРоба Є PRobe'
527 >>> truncate(s, 10, '•••') # utf-8 string as *dots*
528 'ПРоба Є•••'
529 >>> truncate(s, 10, u'®') # you can use unicode string as *dots*
530 'ПРоба Є P®'
531 >>> type(truncate(s, 10))
532 <class '__main__.Utf8'>
533 >>> Utf8(s.encode('koi8-u'), 'koi8-u')
534 'ПРоба Є PRobe'
535 >>> s.decode() # convert utf-8 string to unicode
536 u'\\u041f\\u0420\\u043e\\u0431\\u0430 \\u0404 PRobe'
537 >>> a='про\\tba'
538 >>> str_tmp=a.expandtabs()
539 >>> utf8_tmp=Utf8(a).expandtabs()
540 >>> utf8_tmp.replace(' ','.') # RIGHT! (default tabsize is 8)
541 'про.....ba'
542 >>> utf8_tmp.index('b')
543 8
544 >>> print "'"+str_tmp.replace(' ','.')+"'" # WRONG STRING LENGTH!
545 'про..ba'
546 >>> str_tmp.index('b') # WRONG index of 'b' character
547 8
548 >>> print "'"+a.expandtabs(4).replace(' ','.')+"'" # WRONG RESULT!
549 'про..ba'
550 >>> Utf8(a).expandtabs(4).replace(' ','.') # RIGHT!
551 'про.ba'
552 >>> s.find('Є')
553 6
554 >>> s.find(u'Є')
555 6
556 >>> s.find(' ', 6)
557 7
558 >>> s.rfind(' ')
559 7
560 >>> s.partition('Є')
561 ('ПРоба ', 'Є', ' PRobe')
562 >>> s.partition(u'Є')
563 ('ПРоба ', 'Є', ' PRobe')
564 >>> (a,b,c) = s.partition('Є')
565 >>> type(a), type(b), type(c)
566 (<class '__main__.Utf8'>, <class '__main__.Utf8'>, <class '__main__.Utf8'>)
567 >>> s.partition(' ')
568 ('ПРоба', ' ', 'Є PRobe')
569 >>> s.rpartition(' ')
570 ('ПРоба Є', ' ', 'PRobe')
571 >>> s.index('Є')
572 6
573 >>> s.rindex(u'Є')
574 6
575 >>> s.index(' ')
576 5
577 >>> s.rindex(' ')
578 7
579 >>> a=Utf8('а б ц д е а б ц д е а\\tб ц д е')
580 >>> a.split()
581 ['а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д',
582 'е', 'а', 'б', 'ц', 'д', 'е']
583 >>> a.rsplit()
584 ['а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д',
585 'е', 'а', 'б', 'ц', 'д', 'е']
586 >>> a.expandtabs().split('б')
587 ['а ', ' ц д е а ', ' ц д е а ', ' ц д е']
588 >>> a.expandtabs().rsplit('б')
589 ['а ', ' ц д е а ', ' ц д е а ', ' ц д е']
590 >>> a.expandtabs().split(u'б', 1)
591 ['а ', ' ц д е а б ц д е а б ц д е']
592 >>> a.expandtabs().rsplit(u'б', 1)
593 ['а б ц д е а б ц д е а ', ' ц д е']
594 >>> a=Utf8("рядок1\\nрядок2\\nрядок3")
595 >>> a.splitlines()
596 ['рядок1', 'рядок2', 'рядок3']
597 >>> a.splitlines(True)
598 ['рядок1\\n', 'рядок2\\n', 'рядок3']
599 >>> s[6]
600 'Є'
601 >>> s[0]
602 'П'
603 >>> s[-1]
604 'e'
605 >>> s[:10]
606 'ПРоба Є PR'
607 >>> s[2:-2:2]
608 'оаЄPo'
609 >>> s[::-1]
610 'eboRP Є абоРП'
611 >>> s.startswith('ПР')
612 True
613 >>> s.startswith(('ПР', u'об'),0)
614 True
615 >>> s.startswith(u'об', 2, 4)
616 True
617 >>> s.endswith('be')
618 True
619 >>> s.endswith(('be', 'PR', u'Є'))
620 True
621 >>> s.endswith('PR', 8, 10)
622 True
623 >>> s.endswith('Є', -7, -6)
624 True
625 >>> s.count(' ')
626 2
627 >>> s.count(' ',6)
628 1
629 >>> s.count(u'Є')
630 1
631 >>> s.count('Є', 0, 5)
632 0
633 >>> Utf8(
634 "Parameters: '%(проба)s', %(probe)04d, %(проба2)s") % { u"проба": s,
635 ... "not used": "???", "probe": 2, "проба2": u"ПРоба Probe" }
636 "Parameters: 'ПРоба Є PRobe', 0002, ПРоба Probe"
637 >>> a=Utf8(u"Параметр: (%s)-(%s)-[%s]")
638 >>> a%=(s, s[::-1], 1000)
639 >>> a
640 'Параметр: (ПРоба Є PRobe)-(eboRP Є абоРП)-[1000]'
641 >>> if hasattr(Utf8, 'format'):
642 ... Utf8("Проба <{0}>, {1}, {param1}, {param2}").format(s, u"中文字",
643 ... param1="барабан", param2=1000) == 'Проба <ПРоба Є PRobe>, 中文字, барабан, 1000'
644 ... else: # format() method is not used in python with version <2.6:
645 ... print True
646 True
647 >>> u'Б'<u'Ї' # WRONG ORDER!
648 False
649 >>> 'Б'<'Ї' # WRONG ORDER!
650 False
651 >>> Utf8('Б')<'Ї' # RIGHT!
652 True
653 >>> u'д'>u'ґ' # WRONG ORDER!
654 False
655 >>> Utf8('д')>Utf8('ґ') # RIGHT!
656 True
657 >>> u'є'<=u'ж' # WRONG ORDER!
658 False
659 >>> Utf8('є')<=u'ж' # RIGHT!
660 True
661 >>> Utf8('є')<=u'є'
662 True
663 >>> u'Ї'>=u'И' # WRONG ORDER!
664 False
665 >>> Utf8(u'Ї') >= u'И' # RIGHT
666 True
667 >>> Utf8('Є') >= 'Є'
668 True
669 >>> a="яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ" # str type
670 >>> b=u"яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ" # unicode type
671 >>> c=Utf8("яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ") # utf8 class
672 >>> result = "".join(sorted(a))
673 >>> result[0:20] # result is not utf8 string, because bytes, not utf8-characters were sorted
674 '\\x80\\x81\\x82\\x83\\x84\\x84\\x85\\x86\\x86\\x87\\x87\\x88\\x89\\x8c\\x8e\\x8f\\x90\\x90\\x91\\x91'
675 >>> try:
676 ... unicode(result, 'utf-8') # try to convert result (utf-8?) to unicode
677 ... except Exception, e:
678 ... print 'Exception:', e
679 Exception: 'utf8' codec can't decode byte 0x80 in position 0: unexpected code byte
680 >>> try: # FAILED! (working with bytes, not with utf8-charactes)
681 ... "".join( sorted(a, key=sort_key) ) # utf8.sort_key may be used with utf8 or unicode strings only!
682 ... except Exception, e:
683 ... print 'Exception:', e
684 Exception: 'utf8' codec can't decode byte 0xd1 in position 0: unexpected end of data
685 >>> print "".join( sorted(Utf8(a))) # converting *a* to unicode or utf8-string gives us correct result
686 аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
687 >>> print u"".join( sorted(b) ) # WRONG ORDER! Default sort key is used
688 ЄІЇАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯабвгдежзийклмнопрстуфхцчшщьюяєіїҐґ
689 >>> print u"".join( sorted(b, key=sort_key) ) # RIGHT ORDER! utf8.sort_key is used
690 аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
691 >>> print "".join( sorted(c) ) # RIGHT ORDER! Utf8 "rich comparison" methods are used
692 аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
693 >>> print "".join( sorted(c, key=sort_key) ) # RIGHT ORDER! utf8.sort_key is used
694 аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
695 >>> Utf8().join(sorted(c.decode(), key=sort_key)) # convert to unicode for better performance
696 'аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ'
697 >>> for result in sorted(
698 ["Іа", "Астро", u"гала", Utf8("Гоша"), "Єва", "шовк", "аякс", "Їжа",
699 ... "ґанок", Utf8("Дар'я"), "білінг", "веб", u"Жужа", "проба", u"тест",
700 ... "абетка", "яблуко", "Юляся", "Київ", "лимонад", "ложка", "Матриця",
701 ... ], key=sort_key):
702 ... print result.ljust(20), type(result)
703 абетка <type 'str'>
704 Астро <type 'str'>
705 аякс <type 'str'>
706 білінг <type 'str'>
707 веб <type 'str'>
708 гала <type 'unicode'>
709 ґанок <type 'str'>
710 Гоша <class '__main__.Utf8'>
711 Дар'я <class '__main__.Utf8'>
712 Єва <type 'str'>
713 Жужа <type 'unicode'>
714 Іа <type 'str'>
715 Їжа <type 'str'>
716 Київ <type 'str'>
717 лимонад <type 'str'>
718 ложка <type 'str'>
719 Матриця <type 'str'>
720 проба <type 'str'>
721 тест <type 'unicode'>
722 шовк <type 'str'>
723 Юляся <type 'str'>
724 яблуко <type 'str'>
725 >>> a=Utf8("中文字")
726 >>> L=list(a)
727 >>> L
728 ['中', '文', '字']
729 >>> a="".join(L)
730 >>> print a
731 中文字
732 >>> type(a)
733 <type 'str'>
734 >>> a="中文字" # standard str type
735 >>> L=list(a)
736 >>> L
737 ['\\xe4', '\\xb8', '\\xad', '\\xe6', '\\x96', '\\x87',
738 '\\xe5', '\\xad', '\\x97']
739 >>> from string import maketrans
740 >>> str_tab=maketrans('PRobe','12345')
741 >>> unicode_tab={ord(u'П'):ord(u'Ж'),
742 ... ord(u'Р') : u'Ш',
743 ... ord(Utf8('о')) : None, # utf8.ord() is used
744 ... ord('б') : None, # -//-//-
745 ... ord(u'а') : u"中文字",
746 ... ord(u'Є') : Utf8('•').decode(), # only unicode type is supported
747 ... }
748 >>> s.translate(unicode_tab).translate(str_tab, deletechars=' ')
749 'ЖШ中文字•12345'
750 """
751 import sys
752 reload(sys)
753 sys.setdefaultencoding("UTF-8")
754 import doctest
755 print "DOCTESTS STARTED..."
756 doctest.testmod()
757 print "DOCTESTS FINISHED"
758
759 doctests()
760