12345678910111213141516171819202122232425262728293031323334353637383940414243 |
def truncate_by_backing_up_bytes(s, max_bytes):
    """
    Trim a string so that its UTF-8 encoding occupies at most
    ``max_bytes`` bytes, never splitting a multi-byte character.

    The string is encoded, cut at the byte limit, and - if the cut landed
    inside a multi-byte sequence - backed up to just before that
    sequence's lead byte.

    :param s: The string to shorten
    :param max_bytes: Upper bound on the encoded length, in bytes
    :return: The longest prefix of ``s`` whose UTF-8 encoding fits into
        ``max_bytes`` bytes; ``''`` if ``s`` is empty or ``max_bytes`` < 1
    """
    # Nothing can be kept: empty input or no byte budget at all.
    if s == '' or max_bytes < 1:
        return ''

    # Every character encodes to at least one byte, so characters beyond
    # index max_bytes can never fit. Dropping them before encoding avoids
    # encoding a potentially huge `s` just to keep a handful of bytes.
    encoded = s[:max_bytes].encode('utf-8')[:max_bytes]

    # High bit clear on the final byte: it is ASCII, so no character was cut.
    if not encoded[-1] & 0b10000000:
        return encoded.decode('utf-8')

    # The final byte belongs to a multi-byte sequence. Walk backwards over
    # at most four bytes (the longest UTF-8 sequence) to the nearest lead
    # byte, i.e. one of the form 11xxxxxx.
    window = min(4, len(encoded))
    distance = next(
        back
        for back in range(1, window + 1)
        if encoded[-back] & 0b11000000 == 0b11000000
    )
    lead_byte = encoded[-distance]

    # The first clear bit after the two leading ones gives the length the
    # lead byte announces: 110xxxxx -> 2, 1110xxxx -> 3, 11110xxx -> 4.
    announced_length = next(
        length
        for mask, length in (
            (0b00100000, 2),
            (0b00010000, 3),
            (0b00001000, 4),
        )
        if not lead_byte & mask
    )

    # Fewer bytes present than announced: the last character is incomplete,
    # so drop it entirely (lead byte included).
    if announced_length > distance:
        encoded = encoded[:-distance]
    return encoded.decode('utf-8')
|