truncate_2.py 1.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
  1. def truncate_by_backing_up_bytes(s, max_bytes):
  2. """
  3. Ensure that the UTF-8 encoding of a string has not more than
  4. max_bytes bytes
  5. :param s: The string
  6. :param max_bytes: Maximal number of bytes
  7. :return: The cut string
  8. """
  9. def safe_b_of_i(b, i):
  10. try:
  11. return b[i]
  12. except IndexError:
  13. return 0
  14. # Edge cases
  15. if s == '' or max_bytes < 1:
  16. return ''
  17. # cut it twice to avoid encoding potentially GBs of `s` just to get e.g. 10 bytes?
  18. b = s[:max_bytes].encode('utf-8')[:max_bytes]
  19. if b[-1] & 0b10000000:
  20. last_11xxxxxx_index = [
  21. i
  22. for i in range(-1, -5, -1)
  23. if safe_b_of_i(b, i) & 0b11000000 == 0b11000000
  24. ][0]
  25. # note that last_11xxxxxx_index is negative
  26. last_11xxxxxx = b[last_11xxxxxx_index]
  27. if not last_11xxxxxx & 0b00100000:
  28. last_char_length = 2
  29. elif not last_11xxxxxx & 0b0010000:
  30. last_char_length = 3
  31. elif not last_11xxxxxx & 0b0001000:
  32. last_char_length = 4
  33. if last_char_length > -last_11xxxxxx_index:
  34. # remove the incomplete character
  35. b = b[:last_11xxxxxx_index]
  36. return b.decode('utf-8')