generate_hash_macro.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
  1. #!/usr/bin/env python3
  2. # Copyright 2020 The Pigweed Authors
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5. # use this file except in compliance with the License. You may obtain a copy of
  6. # the License at
  7. #
  8. # https://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  12. # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  13. # License for the specific language governing permissions and limitations under
  14. # the License.
  15. """Generates a C macro for the PW tokenizer 65599 fixed length hash."""
  16. from __future__ import print_function
  17. import datetime
  18. import os
  19. HASH_CONSTANT = 65599
  20. HASH_NAME = 'pw_tokenizer_65599_fixed_length'
  21. HASH_LENGTHS = 80, 96, 128
  22. FILE_HEADER = """\
  23. // Copyright {year} The Pigweed Authors
  24. //
  25. // Licensed under the Apache License, Version 2.0 (the "License"); you may not
  26. // use this file except in compliance with the License. You may obtain a copy of
  27. // the License at
  28. //
  29. // https://www.apache.org/licenses/LICENSE-2.0
  30. //
  31. // Unless required by applicable law or agreed to in writing, software
  32. // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  33. // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  34. // License for the specific language governing permissions and limitations under
  35. // the License.
  36. // AUTOGENERATED - DO NOT EDIT
  37. //
  38. // This file was generated by {script}.
  39. // To make changes, update the script and run it to regenerate the files.
  40. #pragma once
  41. #include <stdint.h>
  42. // {hash_length}-character version of the tokenizer hash function.
  43. //
  44. // The argument must be a string literal. It is concatenated with "" to ensure
  45. // that this is the case.
  46. //
  47. // clang-format off
  48. """
  49. def generate_pw_tokenizer_65599_fixed_length_hash_macro(hash_length):
  50. """Generate macro that hashes a string literal using a modified x65599 hash.
  51. The macros generated by this function only operate on string literals.
  52. Since macros can only operate on fixed-length strings, the hash macro only
  53. hashes up to a fixed length, and characters beyond that length are ignored.
  54. To eliminate some collisions, the length of the string is hashed as if it
  55. were the first character.
  56. This hash is calculated with the following equation, where s is the string
  57. and k is the maximum hash length:
  58. H(s, k) = len(s) + 65599 * s[0] + 65599^2 * s[1] + ... + 65599^k * s[k-1]
  59. The hash algorithm is a modified version of the x65599 hash used by the SDBM
  60. open source project. This hash has the following differences from x65599:
  61. - Characters are only hashed up to a fixed maximum string length.
  62. - Characters are hashed in reverse order.
  63. - The string length is hashed as the first character in the string.
  64. The code generated by this function is intentionally sparse. Each character
  65. appears hash_length times per log message, so using fewer characters results
  66. in faster compilation times.
  67. Args:
  68. hash_length: maximum string size to hash; extra characters are ignored
  69. Returns:
  70. the macro header file as a string
  71. """
  72. first_hash_term = ('(uint32_t)(sizeof(str "") - 1 + '
  73. '/* The argument must be a string literal. */ \\\n')
  74. # Use this to add the aligned backslash at the end of the macro lines.
  75. line_format = '{{:<{}}}\\\n'.format(len(first_hash_term))
  76. lines = [
  77. FILE_HEADER.format(script=os.path.basename(__file__),
  78. hash_length=hash_length,
  79. year=datetime.date.today().year)
  80. ]
  81. lines.append(
  82. line_format.format('#define {}_{}_HASH(str)'.format(
  83. HASH_NAME.upper(), hash_length)))
  84. lines.append(' ' + first_hash_term) # add indendation and the macro line
  85. indent = ' ' * len(' (uint32_t)(')
  86. coefficient_format = '0x{coefficient:0>8x}u'
  87. # The string will have at least a null terminator
  88. lines.append(
  89. line_format.format('{}0x{:0>8x}u * (uint8_t)str[0] +'.format(
  90. indent, HASH_CONSTANT)))
  91. # Format string to use for the remaining terms.
  92. term_format = (
  93. '{indent}{coefficient} * '
  94. '(uint8_t)({index} < sizeof(str) ? str[{index}] : 0) +').format(
  95. indent=indent,
  96. coefficient=coefficient_format,
  97. index='{{index:>{}}}'.format(len(str(hash_length - 1))))
  98. for i in range(1, hash_length):
  99. coefficient = HASH_CONSTANT**(i + 1) % 2**32
  100. term = term_format.format(index=i, coefficient=coefficient)
  101. lines.append(line_format.format(term))
  102. # Remove the extra + and \ and add the closing )
  103. lines[-1] = lines[-1].rstrip(' +\\\n') + ')'
  104. lines.append('\n\n// clang-format on\n')
  105. return ''.join(lines)
  106. def _main():
  107. base = os.path.abspath(
  108. os.path.join(os.path.dirname(__file__), '..', 'public', 'pw_tokenizer',
  109. 'internal'))
  110. # Generate macros for hashes of the specified lengths.
  111. for hash_length in HASH_LENGTHS:
  112. path = os.path.join(
  113. base, '{}_{}_hash_macro.h'.format(HASH_NAME, hash_length))
  114. with open(path, 'w') as header_file:
  115. header_file.write(
  116. generate_pw_tokenizer_65599_fixed_length_hash_macro(
  117. hash_length))
  118. print('Generated {}-character hash macro at {}'.format(
  119. hash_length, path))
  120. if __name__ == '__main__':
  121. _main()