1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677 |
"""
Functions that truncate a string so that its UTF-8 encoding does not
exceed a given number of bytes.
"""
def truncate_by_concating(s, max_bytes):
    """
    Ensure that the UTF-8 encoding of a string has not more than
    max_bytes bytes.

    Characters are taken from the front of the string one at a time for as
    long as their accumulated encoded size stays within the budget, so a
    multi-byte character is never split.

    Sizes are measured with the ``'replace'`` error handler, so characters
    that cannot be encoded (e.g. lone surrogates) are counted by the size
    of their replacement marker instead of raising.

    :param s: The string
    :param max_bytes: Maximal number of bytes
    :return: The cut string
    """
    def len_as_bytes(text):
        return len(text.encode(errors='replace'))

    # Every character encodes to at least one byte, so a string with more
    # than max_bytes characters can never fit; only encode the whole string
    # when it has a chance of fitting.
    if len(s) <= max_bytes and len_as_bytes(s) <= max_bytes:
        return s

    # Keep a running byte count instead of re-encoding the growing prefix
    # on every iteration (which would be quadratic in the result length).
    kept = []
    used = 0
    for c in s:
        used += len_as_bytes(c)
        if used > max_bytes:
            break
        kept.append(c)
    return "".join(kept)
def truncate_by_backing_up_bytes(s, max_bytes):
    """
    Cut a string so that its UTF-8 encoding occupies at most max_bytes
    bytes.

    The string is encoded and sliced to max_bytes bytes. If the slice ends
    inside a multi-byte character, the dangling bytes of that character
    are dropped before decoding the result again.

    :param s: The string
    :param max_bytes: Maximal number of bytes
    :return: The cut string
    """
    # Nothing to keep for an empty string or a non-positive budget
    if s == '' or max_bytes < 1:
        return ''

    # Slice the string first: the first max_bytes characters already
    # produce at least max_bytes bytes, so the rest of a potentially huge
    # string never has to be encoded.
    encoded = s[:max_bytes].encode('utf-8')[:max_bytes]

    # A final byte without the high bit is plain ASCII: nothing was split
    if not encoded[-1] & 0x80:
        return encoded.decode('utf-8')

    # Look at up to the last four bytes, newest first, for the lead byte
    # (bit pattern 11xxxxxx) of the final character. The offset is counted
    # from the end of the data, starting at 1.
    tail = encoded[-4:]
    lead_offset = [
        offset
        for offset, byte in enumerate(reversed(tail), start=1)
        if byte & 0xC0 == 0xC0
    ][0]
    lead_byte = encoded[-lead_offset]

    # The lead byte announces how many bytes the character consists of
    if lead_byte < 0xE0:
        # 110xxxxx
        expected_length = 2
    elif lead_byte < 0xF0:
        # 1110xxxx
        expected_length = 3
    elif lead_byte < 0xF8:
        # 11110xxx
        expected_length = 4

    # Fewer bytes present than announced: the character is incomplete,
    # so cut it off starting at its lead byte
    if expected_length > lead_offset:
        encoded = encoded[:-lead_offset]

    return encoded.decode('utf-8')
# All truncation implementations of this module, collected in one list
truncate_funcs = [
    truncate_by_concating,
    truncate_by_backing_up_bytes,
]
|