# truncate.py
"""
Functions that truncate a string so that its UTF-8 encoding fits
within a given number of bytes.
"""
  4. def truncate_by_concating(s, max_bytes):
  5. """
  6. Ensure that the UTF-8 encoding of a string has not more than
  7. max_bytes bytes
  8. :param s: The string
  9. :param max_bytes: Maximal number of bytes
  10. :return: The cut string
  11. """
  12. def len_as_bytes(s):
  13. return len(s.encode(errors='replace'))
  14. if len_as_bytes(s) <= max_bytes:
  15. return s
  16. res = ""
  17. for c in s:
  18. old = res
  19. res += c
  20. if len_as_bytes(res) > max_bytes:
  21. res = old
  22. break
  23. return res
  24. def truncate_by_backing_up_bytes(s, max_bytes):
  25. """
  26. Ensure that the UTF-8 encoding of a string has not more than
  27. max_bytes bytes
  28. :param s: The string
  29. :param max_bytes: Maximal number of bytes
  30. :return: The cut string
  31. """
  32. def safe_b_of_i(b, i):
  33. try:
  34. return b[i]
  35. except IndexError:
  36. return 0
  37. # Edge cases
  38. if s == '' or max_bytes < 1:
  39. return ''
  40. # cut it twice to avoid encoding potentially GBs of `s` just to get e.g. 10 bytes?
  41. b = s[:max_bytes].encode('utf-8')[:max_bytes]
  42. if b[-1] & 0b10000000:
  43. last_11xxxxxx_index = [
  44. i
  45. for i in range(-1, -5, -1)
  46. if safe_b_of_i(b, i) & 0b11000000 == 0b11000000
  47. ][0]
  48. # note that last_11xxxxxx_index is negative
  49. last_11xxxxxx = b[last_11xxxxxx_index]
  50. if not last_11xxxxxx & 0b00100000:
  51. last_char_length = 2
  52. elif not last_11xxxxxx & 0b0010000:
  53. last_char_length = 3
  54. elif not last_11xxxxxx & 0b0001000:
  55. last_char_length = 4
  56. if last_char_length > -last_11xxxxxx_index:
  57. # remove the incomplete character
  58. b = b[:last_11xxxxxx_index]
  59. return b.decode('utf-8')
  60. # List of the truncate functions
  61. truncate_funcs = [truncate_by_concating, truncate_by_backing_up_bytes]