gluon.utf8

1 #!/usr/bin/env python 2 # -*- coding: utf-8 -*- 3 """ 4 This file is part of the web2py Web Framework 5 Copyrighted by Massimo Di Pierro <mdipierro@cs.depaul.edu> 6 License: LGPLv3 (http://www.gnu.org/licenses/lgpl.html) 7 8 Created by Vladyslav Kozlovskyy (Ukraine) <dbdevelop©gmail.com> 9 for Web2py project 10 11 Utilities and class for UTF8 strings managing 12 =========================================== 13 """ 14 import __builtin__ 15 __all__ = ['Utf8'] 16 17 repr_escape_tab = {} 18 for i in range(1, 32): 19 repr_escape_tab[i] = ur'\x%02x' % i 20 repr_escape_tab[7] = u'\\a' 21 repr_escape_tab[8] = u'\\b' 22 repr_escape_tab[9] = u'\\t' 23 repr_escape_tab[10] = u'\\n' 24 repr_escape_tab[11] = u'\\v' 25 repr_escape_tab[12] = u'\\f' 26 repr_escape_tab[13] = u'\\r' 27 repr_escape_tab[ord('\\')] = u'\\\\' 28 repr_escape_tab2 = repr_escape_tab.copy() 29 repr_escape_tab2[ord('\'')] = u"\\'" 30 31

def sort_key(s):
    """ Unicode Collation Algorithm (UCA) (http://www.unicode.org/reports/tr10/)
        is used for utf-8 and unicode strings sorting and for utf-8 strings
        comparison

        NOTE: pyuca is a very memory cost module! It loads the whole
              "allkey.txt" file (~2mb!) into the memory. But this
              functionality is needed only when sort_key() is called as a
              part of sort() function or when Utf8 strings are compared.

        So, it is a lazy "sort_key" function which (ONLY ONCE, ON ITS
        FIRST CALL) imports pyuca and replaces itself with a real
        sort_key() function

        args:
            s (str or unicode): utf-8 or unicode string to build the key for

        returns:
            the pyuca collation key of *s*, or (when pyuca can not be
            loaded) the lower-cased unicode version of *s*
    """
    global sort_key
    try:
        from gluon.contrib.pyuca import unicode_collator
        unicode_sort_key = unicode_collator.sort_key
        sort_key = lambda s: unicode_sort_key(
            unicode(s, 'utf-8') if isinstance(s, str) else s)
    except Exception:
        # Best-effort fallback: any failure while loading pyuca degrades to
        # a plain lower-cased comparison. "except Exception" (instead of a
        # bare "except") keeps KeyboardInterrupt and SystemExit propagating.
        sort_key = lambda s: (
            unicode(s, 'utf-8') if isinstance(s, str) else s).lower()
    # the module-level name now points to the real implementation
    return sort_key(s)

56 57

58 -def ord(char):

59 """ returns unicode id for utf8 or unicode *char* character 60 61 SUPPOSE that *char* is an utf-8 or unicode character only 62 """ 63 if isinstance(char, unicode): 64 return __builtin__.ord(char) 65 return __builtin__.ord(unicode(char, 'utf-8'))

66 67

def chr(code):
    """ build the character with unicode id *code* and return it as Utf8 """
    character = unichr(code)
    return Utf8(character)

71 72

def size(string):
    """ number of bytes that *string* occupies as utf-8

        NOTE! An unicode argument is packed to utf-8 first, so the
              length of the correspondent utf-8 string is returned
    """
    packed = Utf8(string)
    return packed.__size__()

79 80

def truncate(string, length, dots='...'):
    """ returns *string* unchanged when it is not longer than *length*
        characters, otherwise cuts it and appends the *dots* suffix so
        that the result is *length* characters long

    args:
        string (str or unicode): utf-8 or unicode text to shorten
        length (int): max length of the result in characters (not bytes)
        dots (str or unicode): string suffix, used when string is cut

    returns:
        (utf8-str): original or cut string
    """
    # unicode(x, 'utf-8') refuses unicode input, so decode only byte strings
    text = string if isinstance(string, unicode) else unicode(string, 'utf-8')
    dots = unicode(dots, 'utf-8') if isinstance(dots, str) else dots
    if len(text) > length:
        # A negative slice bound would count from the END of the text and
        # could make the result longer than the input, so clamp it at 0.
        keep = max(length - len(dots), 0)
        # When *length* is shorter than the suffix the suffix is cut too,
        # so the result never exceeds *length* characters.
        text = (text[:keep] + dots)[:max(length, 0)]
    return str.__new__(Utf8, text.encode('utf-8'))

97 98

99 -class Utf8(str):

100 """ 101 Class for utf8 string storing and manipulations 102 103 The base presupposition of this class usage is: 104 "ALL strings in the application are either of 105 utf-8 or unicode type, even when simple str 106 type is used. UTF-8 is only a "packed" version 107 of unicode, so Utf-8 and unicode strings are 108 interchangeable." 109 110 CAUTION! This class is slower than str/unicode! 111 Do NOT use it inside intensive loops. Simply 112 decode string(s) to unicode before loop and 113 encode it back to utf-8 string(s) after 114 intensive calculation. 115 116 You can see the benefit of this class in doctests() below 117 """

118 - def __new__(cls, content='', codepage='utf-8'):

119 if isinstance(content, unicode): 120 return str.__new__(cls, unicode.encode(content, 'utf-8')) 121 elif codepage in ('utf-8', 'utf8') or isinstance(content, cls): 122 return str.__new__(cls, content) 123 else: 124 return str.__new__(cls, unicode(content, codepage).encode('utf-8'))

125

126 - def __repr__(self):

127 r''' # note that we use raw strings to avoid having to use double back slashes below 128 NOTE! This function is a clone of web2py:gluon.languages.utf_repl() function 129 130 utf8.__repr__() works same as str.repr() when processing ascii string 131 >>> repr(Utf8('abc')) == repr(Utf8("abc")) == repr('abc') == repr("abc") == "'abc'" 132 True 133 >>> repr(Utf8('a"b"c')) == repr('a"b"c') == '\'a"b"c\'' 134 True 135 >>> repr(Utf8("a'b'c")) == repr("a'b'c") == '"a\'b\'c"' 136 True 137 >>> repr(Utf8('a\'b"c')) == repr('a\'b"c') == repr(Utf8("a'b\"c")) == repr("a'b\"c") == '\'a\\\'b"c\'' 138 True 139 >>> repr(Utf8('a\r\nb')) == repr('a\r\nb') == "'a\\r\\nb'" # Test for \r, \n 140 True 141 142 Unlike str.repr(), Utf8.__repr__() remains utf8 content when processing utf8 string 143 >>> repr(Utf8('中文字')) == repr(Utf8("中文字")) == "'中文字'" != repr('中文字') 144 True 145 >>> repr(Utf8('中"文"字')) == "'中\"文\"字'" != repr('中"文"字') 146 True 147 >>> repr(Utf8("中'文'字")) == '"中\'文\'字"' != repr("中'文'字") 148 True 149 >>> repr(Utf8('中\'文"字')) == repr(Utf8("中'文\"字")) == '\'中\\\'文"字\'' != repr('中\'文"字') == repr("中'文\"字") 150 True 151 >>> repr(Utf8('中\r\n文')) == "'中\\r\\n文'" != repr('中\r\n文') # Test for \r, \n 152 True 153 ''' 154 if str.find(self, "'") >= 0 and str.find(self, '"') < 0: # only single quote exists 155 return '"' + unicode(self, 'utf-8').translate(repr_escape_tab).encode('utf-8') + '"' 156 else: 157 return "'" + unicode(self, 'utf-8').translate(repr_escape_tab2).encode('utf-8') + "'"

158

159 - def __size__(self):

160 """ length of utf-8 string in bytes """ 161 return str.__len__(self)

162

163 - def __contains__(self, other):

164 return str.__contains__(self, Utf8(other))

165

166 - def __getitem__(self, index):

167 return str.__new__(Utf8, unicode(self, 'utf-8')[index].encode('utf-8'))

168

169 - def __getslice__(self, begin, end):

170 return str.__new__(Utf8, unicode(self, 'utf-8')[begin:end].encode('utf-8'))

171

172 - def __add__(self, other):

173 return str.__new__(Utf8, str.__add__(self, unicode.encode(other, 'utf-8') 174 if isinstance(other, unicode) else other))

175

176 - def __len__(self):

177 return len(unicode(self, 'utf-8'))

178

179 - def __mul__(self, integer):

180 return str.__new__(Utf8, str.__mul__(self, integer))

181

182 - def __eq__(self, string):

183 return str.__eq__(self, Utf8(string))

184

185 - def __ne__(self, string):

186 return str.__ne__(self, Utf8(string))

187

188 - def capitalize(self):

189 return str.__new__(Utf8, unicode(self, 'utf-8').capitalize().encode('utf-8'))

190

191 - def center(self, length):

192 return str.__new__(Utf8, unicode(self, 'utf-8').center(length).encode('utf-8'))

193

194 - def upper(self):

195 return str.__new__(Utf8, unicode(self, 'utf-8').upper().encode('utf-8'))

196

197 - def lower(self):

198 return str.__new__(Utf8, unicode(self, 'utf-8').lower().encode('utf-8'))

199

200 - def title(self):

201 return str.__new__(Utf8, unicode(self, 'utf-8').title().encode('utf-8'))

202

203 - def index(self, string):

204 return unicode(self, 'utf-8').index(string if isinstance(string, unicode) else unicode(string, 'utf-8'))

205

206 - def isalnum(self):

207 return unicode(self, 'utf-8').isalnum()

208

209 - def isalpha(self):

210 return unicode(self, 'utf-8').isalpha()

211

212 - def isdigit(self):

213 return unicode(self, 'utf-8').isdigit()

214

215 - def islower(self):

216 return unicode(self, 'utf-8').islower()

217

218 - def isspace(self):

219 return unicode(self, 'utf-8').isspace()

220

221 - def istitle(self):

222 return unicode(self, 'utf-8').istitle()

223

224 - def isupper(self):

225 return unicode(self, 'utf-8').isupper()

226

227 - def zfill(self, length):

228 return str.__new__(Utf8, unicode(self, 'utf-8').zfill(length).encode('utf-8'))

229

230 - def join(self, iter):

231 return str.__new__(Utf8, str.join(self, [Utf8(c) for c in 232 list(unicode(iter, 'utf-8') if 233 isinstance(iter, str) else 234 iter)]))

235

236 - def lstrip(self, chars=None):

237 return str.__new__(Utf8, str.lstrip(self, None if chars is None else Utf8(chars)))

238

239 - def rstrip(self, chars=None):

240 return str.__new__(Utf8, str.rstrip(self, None if chars is None else Utf8(chars)))

241

242 - def strip(self, chars=None):

243 return str.__new__(Utf8, str.strip(self, None if chars is None else Utf8(chars)))

244

245 - def swapcase(self):

246 return str.__new__(Utf8, unicode(self, 'utf-8').swapcase().encode('utf-8'))

247

248 - def count(self, sub, start=0, end=None):

249 unistr = unicode(self, 'utf-8') 250 return unistr.count( 251 unicode(sub, 'utf-8') if isinstance(sub, str) else sub, 252 start, len(unistr) if end is None else end)

253

254 - def decode(self, encoding='utf-8', errors='strict'):

255 return str.decode(self, encoding, errors)

256

257 - def encode(self, encoding, errors='strict'):

258 return unicode(self, 'utf-8').encode(encoding, errors)

259

260 - def expandtabs(self, tabsize=8):

261 return str.__new__(Utf8, unicode(self, 'utf-8').expandtabs(tabsize).encode('utf-8'))

262

263 - def find(self, sub, start=None, end=None):

264 return unicode(self, 'utf-8').find(unicode(sub, 'utf-8') 265 if isinstance(sub, str) else sub, start, end)

266

267 - def ljust(self, width, fillchar=' '):

268 return str.__new__(Utf8, unicode(self, 'utf-8').ljust(width, unicode(fillchar, 'utf-8') 269 if isinstance(fillchar, str) else fillchar).encode('utf-8'))

270

271 - def partition(self, sep):

272 (head, sep, tail) = str.partition(self, Utf8(sep)) 273 return (str.__new__(Utf8, head), 274 str.__new__(Utf8, sep), 275 str.__new__(Utf8, tail))

276

277 - def replace(self, old, new, count=-1):

278 return str.__new__(Utf8, str.replace(self, Utf8(old), Utf8(new), count))

279

280 - def rfind(self, sub, start=None, end=None):

281 return unicode(self, 'utf-8').rfind(unicode(sub, 'utf-8') 282 if isinstance(sub, str) else sub, start, end)

283

284 - def rindex(self, string):

285 return unicode(self, 'utf-8').rindex(string if isinstance(string, unicode) 286 else unicode(string, 'utf-8'))

287

288 - def rjust(self, width, fillchar=' '):

289 return str.__new__(Utf8, unicode(self, 'utf-8').rjust(width, unicode(fillchar, 'utf-8') 290 if isinstance(fillchar, str) else fillchar).encode('utf-8'))

291

292 - def rpartition(self, sep):

293 (head, sep, tail) = str.rpartition(self, Utf8(sep)) 294 return (str.__new__(Utf8, head), 295 str.__new__(Utf8, sep), 296 str.__new__(Utf8, tail))

297

298 - def rsplit(self, sep=None, maxsplit=-1):

299 return [str.__new__(Utf8, part) for part in str.rsplit(self, 300 None if sep is None else Utf8(sep), maxsplit)]

301

302 - def split(self, sep=None, maxsplit=-1):

303 return [str.__new__(Utf8, part) for part in str.split(self, 304 None if sep is None else Utf8(sep), maxsplit)]

305

306 - def splitlines(self, keepends=False):

307 return [str.__new__(Utf8, part) for part in str.splitlines(self, keepends)]

308

309 - def startswith(self, prefix, start=0, end=None):

310 unistr = unicode(self, 'utf-8') 311 if isinstance(prefix, tuple): 312 prefix = tuple(unicode( 313 s, 'utf-8') if isinstance(s, str) else s for s in prefix) 314 elif isinstance(prefix, str): 315 prefix = unicode(prefix, 'utf-8') 316 return unistr.startswith(prefix, start, len(unistr) if end is None else end)

317

318 - def translate(self, table, deletechars=''):

319 if isinstance(table, dict): 320 return str.__new__(Utf8, unicode(self, 'utf-8').translate(table).encode('utf-8')) 321 else: 322 return str.__new__(Utf8, str.translate(self, table, deletechars))

323

324 - def endswith(self, prefix, start=0, end=None):

325 unistr = unicode(self, 'utf-8') 326 if isinstance(prefix, tuple): 327 prefix = tuple(unicode( 328 s, 'utf-8') if isinstance(s, str) else s for s in prefix) 329 elif isinstance(prefix, str): 330 prefix = unicode(prefix, 'utf-8') 331 return unistr.endswith(prefix, start, len(unistr) if end is None else end)

332 if hasattr(str, 'format'): # Python 2.5 hasn't got str.format() method

333 - def format(self, *args, **kwargs):

334 args = [unicode( 335 s, 'utf-8') if isinstance(s, str) else s for s in args] 336 kwargs = dict((unicode(k, 'utf-8') if isinstance(k, str) else k, 337 unicode(v, 'utf-8') if isinstance(v, str) else v) 338 for k, v in kwargs.iteritems()) 339 return str.__new__(Utf8, unicode(self, 'utf-8'). 340 format(*args, **kwargs).encode('utf-8'))

341

342 - def __mod__(self, right):

343 if isinstance(right, tuple): 344 right = tuple(unicode(v, 'utf-8') if isinstance(v, str) else v 345 for v in right) 346 elif isinstance(right, dict): 347 right = dict((unicode(k, 'utf-8') if isinstance(k, str) else k, 348 unicode(v, 'utf-8') if isinstance(v, str) else v) 349 for k, v in right.iteritems()) 350 elif isinstance(right, str): 351 right = unicode(right, 'utf-8') 352 return str.__new__(Utf8, unicode(self, 'utf-8').__mod__(right).encode('utf-8'))

353

354 - def __ge__(self, string):

355 return sort_key(self) >= sort_key(string)

356

357 - def __gt__(self, string):

358 return sort_key(self) > sort_key(string)

359

360 - def __le__(self, string):

361 return sort_key(self) <= sort_key(string)

362

363 - def __lt__(self, string):

364 return sort_key(self) < sort_key(string)

365 366 367 if __name__ == '__main__':

368 - def doctests():

369 u""" 370 doctests: 371 >>> test_unicode=u'ПРоба Є PRobe' 372 >>> test_unicode_word=u'ПРоба' 373 >>> test_number_str='12345' 374 >>> test_unicode 375 u'\\u041f\\u0420\\u043e\\u0431\\u0430 \\u0404 PRobe' 376 >>> print test_unicode 377 ПРоба Є PRobe 378 >>> test_word=test_unicode_word.encode('utf-8') 379 >>> test_str=test_unicode.encode('utf-8') 380 >>> s=Utf8(test_str) 381 >>> s 382 'ПРоба Є PRobe' 383 >>> type(s) 384 <class '__main__.Utf8'> 385 >>> s == test_str 386 True 387 >>> len(test_str) # wrong length of utf8-string! 388 19 389 >>> len(test_unicode) # RIGHT! 390 13 391 >>> len(s) # RIGHT! 392 13 393 >>> size(test_str) # size of utf-8 string (in bytes) == len(str) 394 19 395 >>> size(test_unicode) # size of unicode string in bytes (packed to utf-8 string) 396 19 397 >>> size(s) # size of utf-8 string in bytes 398 19 399 >>> try: # utf-8 is a multibyte string. Convert it to unicode for use with builtin ord() 400 ... __builtin__.ord('б') # ascii string 401 ... except Exception, e: 402 ... print 'Exception:', e 403 Exception: ord() expected a character, but string of length 2 found 404 >>> ord('б') # utf8.ord() is used(!!!) 405 1073 406 >>> ord(u'б') # utf8.ord() is used(!!!) 407 1073 408 >>> ord(s[3]) # utf8.ord() is used(!!!) 409 1073 410 >>> chr(ord(s[3])) # utf8.chr() and utf8.chr() is used(!!!) 411 'б' 412 >>> type(chr(1073)) # utf8.chr() is used(!!!) 
413 <class '__main__.Utf8'> 414 >>> s=Utf8(test_unicode) 415 >>> s 416 'ПРоба Є PRobe' 417 >>> s == test_str 418 True 419 >>> test_str == s 420 True 421 >>> s == test_unicode 422 True 423 >>> test_unicode == s 424 True 425 >>> print test_str.upper() # only ASCII characters uppered 426 ПРоба Є PROBE 427 >>> print test_unicode.upper() # unicode gives right result 428 ПРОБА Є PROBE 429 >>> s.upper() # utf8 class use unicode.upper() 430 'ПРОБА Є PROBE' 431 >>> type(s.upper()) 432 <class '__main__.Utf8'> 433 >>> s.lower() 434 'проба є probe' 435 >>> type(s.lower()) 436 <class '__main__.Utf8'> 437 >>> s.capitalize() 438 'Проба є probe' 439 >>> type(s.capitalize()) 440 <class '__main__.Utf8'> 441 >>> len(s) 442 13 443 >>> len(test_unicode) 444 13 445 >>> s+'. Probe is проба' 446 'ПРоба Є PRobe. Probe is проба' 447 >>> type(s+'. Probe is проба') 448 <class '__main__.Utf8'> 449 >>> s+u'. Probe is проба' 450 'ПРоба Є PRobe. Probe is проба' 451 >>> type(s+u'. Probe is проба') 452 <class '__main__.Utf8'> 453 >>> s+s 454 'ПРоба Є PRobeПРоба Є PRobe' 455 >>> type(s+s) 456 <class '__main__.Utf8'> 457 >>> a=s 458 >>> a+=s 459 >>> a+=test_unicode 460 >>> a+=test_str 461 >>> a 462 'ПРоба Є PRobeПРоба Є PRobeПРоба Є PRobeПРоба Є PRobe' 463 >>> type(a) 464 <class '__main__.Utf8'> 465 >>> s*3 466 'ПРоба Є PRobeПРоба Є PRobeПРоба Є PRobe' 467 >>> type(s*3) 468 <class '__main__.Utf8'> 469 >>> a=Utf8("-проба-") 470 >>> a*=10 471 >>> a 472 '-проба--проба--проба--проба--проба--проба--проба--проба--проба--проба-' 473 >>> type(a) 474 <class '__main__.Utf8'> 475 >>> print "'"+test_str.center(17)+"'" # WRONG RESULT! 476 'ПРоба Є PRobe' 477 >>> s.center(17) # RIGHT! 478 ' ПРоба Є PRobe ' 479 >>> type(s.center(17)) 480 <class '__main__.Utf8'> 481 >>> (test_word+test_number_str).isalnum() # WRONG RESULT! non ASCII chars are detected as non alpha 482 False 483 >>> Utf8(test_word+test_number_str).isalnum() 484 True 485 >>> s.isalnum() 486 False 487 >>> test_word.isalpha() # WRONG RESULT! 
Non ASCII characters are detected as non alpha 488 False 489 >>> Utf8(test_word).isalpha() # RIGHT! 490 True 491 >>> s.lower().islower() 492 True 493 >>> s.upper().isupper() 494 True 495 >>> print test_str.zfill(17) # WRONG RESULT! 496 ПРоба Є PRobe 497 >>> s.zfill(17) # RIGHT! 498 '0000ПРоба Є PRobe' 499 >>> type(s.zfill(17)) 500 <class '__main__.Utf8'> 501 >>> s.istitle() 502 False 503 >>> s.title().istitle() 504 True 505 >>> Utf8('1234').isdigit() 506 True 507 >>> Utf8(' \t').isspace() 508 True 509 >>> s.join('•|•') 510 '•ПРоба Є PRobe|ПРоба Є PRobe•' 511 >>> s.join((str('(utf8 тест1)'), unicode('(unicode тест2)','utf-8'), '(ascii test3)')) 512 '(utf8 тест1)ПРоба Є PRobe(unicode тест2)ПРоба Є PRobe(ascii test3)' 513 >>> type(s) 514 <class '__main__.Utf8'> 515 >>> s==test_str 516 True 517 >>> s==test_unicode 518 True 519 >>> s.swapcase() 520 'прОБА є prOBE' 521 >>> type(s.swapcase()) 522 <class '__main__.Utf8'> 523 >>> truncate(s, 10) 524 'ПРоба Є...' 525 >>> truncate(s, 20) 526 'ПРоба Є PRobe' 527 >>> truncate(s, 10, '•••') # utf-8 string as *dots* 528 'ПРоба Є•••' 529 >>> truncate(s, 10, u'®') # you can use unicode string as *dots* 530 'ПРоба Є P®' 531 >>> type(truncate(s, 10)) 532 <class '__main__.Utf8'> 533 >>> Utf8(s.encode('koi8-u'), 'koi8-u') 534 'ПРоба Є PRobe' 535 >>> s.decode() # convert utf-8 string to unicode 536 u'\\u041f\\u0420\\u043e\\u0431\\u0430 \\u0404 PRobe' 537 >>> a='про\\tba' 538 >>> str_tmp=a.expandtabs() 539 >>> utf8_tmp=Utf8(a).expandtabs() 540 >>> utf8_tmp.replace(' ','.') # RIGHT! (default tabsize is 8) 541 'про.....ba' 542 >>> utf8_tmp.index('b') 543 8 544 >>> print "'"+str_tmp.replace(' ','.')+"'" # WRONG STRING LENGTH! 545 'про..ba' 546 >>> str_tmp.index('b') # WRONG index of 'b' character 547 8 548 >>> print "'"+a.expandtabs(4).replace(' ','.')+"'" # WRONG RESULT! 549 'про..ba' 550 >>> Utf8(a).expandtabs(4).replace(' ','.') # RIGHT! 
551 'про.ba' 552 >>> s.find('Є') 553 6 554 >>> s.find(u'Є') 555 6 556 >>> s.find(' ', 6) 557 7 558 >>> s.rfind(' ') 559 7 560 >>> s.partition('Є') 561 ('ПРоба ', 'Є', ' PRobe') 562 >>> s.partition(u'Є') 563 ('ПРоба ', 'Є', ' PRobe') 564 >>> (a,b,c) = s.partition('Є') 565 >>> type(a), type(b), type(c) 566 (<class '__main__.Utf8'>, <class '__main__.Utf8'>, <class '__main__.Utf8'>) 567 >>> s.partition(' ') 568 ('ПРоба', ' ', 'Є PRobe') 569 >>> s.rpartition(' ') 570 ('ПРоба Є', ' ', 'PRobe') 571 >>> s.index('Є') 572 6 573 >>> s.rindex(u'Є') 574 6 575 >>> s.index(' ') 576 5 577 >>> s.rindex(' ') 578 7 579 >>> a=Utf8('а б ц д е а б ц д е а\\tб ц д е') 580 >>> a.split() 581 ['а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 582 'е', 'а', 'б', 'ц', 'д', 'е'] 583 >>> a.rsplit() 584 ['а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 585 'е', 'а', 'б', 'ц', 'д', 'е'] 586 >>> a.expandtabs().split('б') 587 ['а ', ' ц д е а ', ' ц д е а ', ' ц д е'] 588 >>> a.expandtabs().rsplit('б') 589 ['а ', ' ц д е а ', ' ц д е а ', ' ц д е'] 590 >>> a.expandtabs().split(u'б', 1) 591 ['а ', ' ц д е а б ц д е а б ц д е'] 592 >>> a.expandtabs().rsplit(u'б', 1) 593 ['а б ц д е а б ц д е а ', ' ц д е'] 594 >>> a=Utf8("рядок1\\nрядок2\\nрядок3") 595 >>> a.splitlines() 596 ['рядок1', 'рядок2', 'рядок3'] 597 >>> a.splitlines(True) 598 ['рядок1\\n', 'рядок2\\n', 'рядок3'] 599 >>> s[6] 600 'Є' 601 >>> s[0] 602 'П' 603 >>> s[-1] 604 'e' 605 >>> s[:10] 606 'ПРоба Є PR' 607 >>> s[2:-2:2] 608 'оаЄPo' 609 >>> s[::-1] 610 'eboRP Є абоРП' 611 >>> s.startswith('ПР') 612 True 613 >>> s.startswith(('ПР', u'об'),0) 614 True 615 >>> s.startswith(u'об', 2, 4) 616 True 617 >>> s.endswith('be') 618 True 619 >>> s.endswith(('be', 'PR', u'Є')) 620 True 621 >>> s.endswith('PR', 8, 10) 622 True 623 >>> s.endswith('Є', -7, -6) 624 True 625 >>> s.count(' ') 626 2 627 >>> s.count(' ',6) 628 1 629 >>> s.count(u'Є') 630 1 631 >>> s.count('Є', 0, 5) 632 0 633 >>> Utf8( 634 "Parameters: '%(проба)s', %(probe)04d, %(проба2)s") % { 
u"проба": s, 635 ... "not used": "???", "probe": 2, "проба2": u"ПРоба Probe" } 636 "Parameters: 'ПРоба Є PRobe', 0002, ПРоба Probe" 637 >>> a=Utf8(u"Параметр: (%s)-(%s)-[%s]") 638 >>> a%=(s, s[::-1], 1000) 639 >>> a 640 'Параметр: (ПРоба Є PRobe)-(eboRP Є абоРП)-[1000]' 641 >>> if hasattr(Utf8, 'format'): 642 ... Utf8("Проба <{0}>, {1}, {param1}, {param2}").format(s, u"中文字", 643 ... param1="барабан", param2=1000) == 'Проба <ПРоба Є PRobe>, 中文字, барабан, 1000' 644 ... else: # format() method is not used in python with version <2.6: 645 ... print True 646 True 647 >>> u'Б'<u'Ї' # WRONG ORDER! 648 False 649 >>> 'Б'<'Ї' # WRONG ORDER! 650 False 651 >>> Utf8('Б')<'Ї' # RIGHT! 652 True 653 >>> u'д'>u'ґ' # WRONG ORDER! 654 False 655 >>> Utf8('д')>Utf8('ґ') # RIGHT! 656 True 657 >>> u'є'<=u'ж' # WRONG ORDER! 658 False 659 >>> Utf8('є')<=u'ж' # RIGHT! 660 True 661 >>> Utf8('є')<=u'є' 662 True 663 >>> u'Ї'>=u'И' # WRONG ORDER! 664 False 665 >>> Utf8(u'Ї') >= u'И' # RIGHT 666 True 667 >>> Utf8('Є') >= 'Є' 668 True 669 >>> a="яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ" # str type 670 >>> b=u"яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ" # unicode type 671 >>> c=Utf8("яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ") # utf8 class 672 >>> result = "".join(sorted(a)) 673 >>> result[0:20] # result is not utf8 string, because bytes, not utf8-characters were sorted 674 '\\x80\\x81\\x82\\x83\\x84\\x84\\x85\\x86\\x86\\x87\\x87\\x88\\x89\\x8c\\x8e\\x8f\\x90\\x90\\x91\\x91' 675 >>> try: 676 ... unicode(result, 'utf-8') # try to convert result (utf-8?) to unicode 677 ... except Exception, e: 678 ... print 'Exception:', e 679 Exception: 'utf8' codec can't decode byte 0x80 in position 0: unexpected code byte 680 >>> try: # FAILED! (working with bytes, not with utf8-charactes) 681 ... "".join( sorted(a, key=sort_key) ) # utf8.sort_key may be used with utf8 or unicode strings only! 682 ... except Exception, e: 683 ... 
print 'Exception:', e 684 Exception: 'utf8' codec can't decode byte 0xd1 in position 0: unexpected end of data 685 >>> print "".join( sorted(Utf8(a))) # converting *a* to unicode or utf8-string gives us correct result 686 аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ 687 >>> print u"".join( sorted(b) ) # WRONG ORDER! Default sort key is used 688 ЄІЇАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯабвгдежзийклмнопрстуфхцчшщьюяєіїҐґ 689 >>> print u"".join( sorted(b, key=sort_key) ) # RIGHT ORDER! utf8.sort_key is used 690 аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ 691 >>> print "".join( sorted(c) ) # RIGHT ORDER! Utf8 "rich comparison" methods are used 692 аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ 693 >>> print "".join( sorted(c, key=sort_key) ) # RIGHT ORDER! utf8.sort_key is used 694 аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ 695 >>> Utf8().join(sorted(c.decode(), key=sort_key)) # convert to unicode for better performance 696 'аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ' 697 >>> for result in sorted( 698 ["Іа", "Астро", u"гала", Utf8("Гоша"), "Єва", "шовк", "аякс", "Їжа", 699 ... "ґанок", Utf8("Дар'я"), "білінг", "веб", u"Жужа", "проба", u"тест", 700 ... "абетка", "яблуко", "Юляся", "Київ", "лимонад", "ложка", "Матриця", 701 ... ], key=sort_key): 702 ... 
print result.ljust(20), type(result) 703 абетка <type 'str'> 704 Астро <type 'str'> 705 аякс <type 'str'> 706 білінг <type 'str'> 707 веб <type 'str'> 708 гала <type 'unicode'> 709 ґанок <type 'str'> 710 Гоша <class '__main__.Utf8'> 711 Дар'я <class '__main__.Utf8'> 712 Єва <type 'str'> 713 Жужа <type 'unicode'> 714 Іа <type 'str'> 715 Їжа <type 'str'> 716 Київ <type 'str'> 717 лимонад <type 'str'> 718 ложка <type 'str'> 719 Матриця <type 'str'> 720 проба <type 'str'> 721 тест <type 'unicode'> 722 шовк <type 'str'> 723 Юляся <type 'str'> 724 яблуко <type 'str'> 725 >>> a=Utf8("中文字") 726 >>> L=list(a) 727 >>> L 728 ['中', '文', '字'] 729 >>> a="".join(L) 730 >>> print a 731 中文字 732 >>> type(a) 733 <type 'str'> 734 >>> a="中文字" # standard str type 735 >>> L=list(a) 736 >>> L 737 ['\\xe4', '\\xb8', '\\xad', '\\xe6', '\\x96', '\\x87', 738 '\\xe5', '\\xad', '\\x97'] 739 >>> from string import maketrans 740 >>> str_tab=maketrans('PRobe','12345') 741 >>> unicode_tab={ord(u'П'):ord(u'Ж'), 742 ... ord(u'Р') : u'Ш', 743 ... ord(Utf8('о')) : None, # utf8.ord() is used 744 ... ord('б') : None, # -//-//- 745 ... ord(u'а') : u"中文字", 746 ... ord(u'Є') : Utf8('•').decode(), # only unicode type is supported 747 ... } 748 >>> s.translate(unicode_tab).translate(str_tab, deletechars=' ') 749 'ЖШ中文字•12345' 750 """ 751 import sys 752 reload(sys) 753 sys.setdefaultencoding("UTF-8") 754 import doctest 755 print "DOCTESTS STARTED..." 756 doctest.testmod() 757 print "DOCTESTS FINISHED"

758 759 doctests() 760

Source Code for Module gluon.utf8