12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576 |
def generate_pw_tokenizer_65599_fixed_length_hash_macro(hash_length):
    """Build the header text for a fixed-length string-literal hash macro.

    The emitted C macro accepts only string literals and evaluates a
    modified x65599 hash (the hash used by the SDBM open source project)
    of its argument. A macro cannot loop over its argument, so one term is
    emitted per character position up to hash_length; anything past that
    position does not contribute to the hash.

    With s the string, k the maximum hash length and c = HASH_CONSTANT, the
    generated expression is

        H(s, k) = len(s) + c * s[0] + c^2 * s[1] + ... + c^k * s[k-1]

    where each coefficient is emitted reduced modulo 2**32.

    Differences from plain x65599:

      - Only the first hash_length characters are hashed.
      - Characters are hashed in reverse order.
      - The string length is hashed as if it were the first character,
        which eliminates some collisions.

    The generated text is deliberately terse: each character shows up
    hash_length times per log message, so fewer characters in the macro
    means faster compilation.

    Args:
      hash_length: maximum string size to hash; extra characters are ignored

    Returns:
      the macro header file as a string
    """
    # Opening of the hash expression: the string length term. Its total
    # length (continuation backslash and newline included) is the width every
    # other macro line is padded to before its own backslash is appended.
    length_term = ('(uint32_t)(sizeof(str "") - 1 + '
                   '/* The argument must be a string literal. */ \\\n')
    pad_width = len(length_term)

    def continued(text):
        # Left-justify to the shared width, then add the line continuation.
        return text.ljust(pad_width) + '\\\n'

    header = FILE_HEADER.format(script=os.path.basename(__file__),
                                hash_length=hash_length,
                                year=datetime.date.today().year)
    define_line = continued('#define {}_{}_HASH(str)'.format(
        HASH_NAME.upper(), hash_length))

    # Every term after the length term is indented to sit under it.
    padding = ' ' * len(' (uint32_t)(')

    # str[0] is read unconditionally: a string literal always holds at least
    # its null terminator.
    first_char_term = continued(
        padding + '0x{:0>8x}u * (uint8_t)str[0] +'.format(HASH_CONSTANT))

    output = [header, define_line, ' ' + length_term, first_char_term]

    # Remaining characters are bounds-checked against sizeof(str) so short
    # literals contribute 0. Indices are right-aligned to the width of the
    # largest index so the generated columns line up.
    index_width = len(str(hash_length - 1))
    term_template = (
        '{pad}0x{multiplier:0>8x}u * '
        '(uint8_t)({position:>{width}} < sizeof(str) ? '
        'str[{position:>{width}}] : 0) +')

    output.extend(
        continued(
            term_template.format(
                pad=padding,
                multiplier=HASH_CONSTANT**(position + 1) % 2**32,
                position=position,
                width=index_width))
        for position in range(1, hash_length))

    # The final term must not end in "+ \": trim that tail and close the
    # parenthesis opened by the length term.
    output[-1] = output[-1].rstrip(' +\\\n') + ')'

    output.append('\n\n// clang-format on\n')

    return ''.join(output)
|