# trunc.py
import re
import sys
  3. MAX_LEN = int(sys.argv[2])
  4. PREFIX_LENGTH = 8
  5. avian = (
  6. 'african_starling',
  7. 'american_black_duck',
  8. 'american_green_winged_teal',
  9. 'american_widgeon',
  10. 'australian_shelduck',
  11. 'bar_headed_goose',
  12. 'black_billed_magpie',
  13. 'black_crowned_night_heron',
  14. 'black_headed_gull',
  15. 'black_necked_grebe',
  16. 'blue_winged_teal',
  17. 'brown_headed_gull',
  18. 'domestic_mallard_duck',
  19. 'double_crested_cormorant',
  20. 'golden_mountain_thrush',
  21. 'great_crested_grebe',
  22. 'greater_white_fronted_goose',
  23. 'green_winged_teal',
  24. 'korean_native_chicken',
  25. 'mountain_hawk_eagle',
  26. 'northern_pintail',
  27. 'northern_shoveler',
  28. 'open_billed_stork',
  29. 'pacific_black_duck',
  30. 'peregrine_falcon',
  31. 'pink_footed_goose',
  32. 'red_crested_pochard',
  33. 'red_necked_stint',
  34. 'red_winged_tinamou',
  35. 'ring_billed_gull',
  36. 'ring_necked_duck',
  37. 'rosy_billed_pochard',
  38. 'rufous_necked_stint',
  39. 'scaly_breasted_munia',
  40. 'semi_palmated_sandpiper',
  41. 'sharp_tailed_sandpiper',
  42. 'slaty_backed_gull',
  43. 'spot_billed_duck',
  44. 'wedge_tailed_shearwater',
  45. 'white_backed_munia',
  46. 'whitefronted_goose',
  47. 'white_front_goose',
  48. 'white_winged_scoter',
  49. 'yellow_headed_amazon',
  50. 'ruddy_turnstone',
  51. )
  52. trans = (('environment', 'env'),
  53. ('california', 'CA'),
  54. ('thailand', 'thai'),
  55. ('guangxi_luochengmulaozuzizhi', 'china'),
  56. ('north_|south_|east_|west_|interior_|central_|western_', ''),
  57. ('>a/ft_benning/wrair1669p/2009_h1n1_a/ft.benning/wrair1669p/2009_h1n1__', '>a/ft_benning/wrair1669p/2009_h1n1_'),
  58. ('>a/aa/huston/1945__|_a_/_h1n1__', '>a/aa/huston/1945_h1n1__'),
  59. )
  60. def smart_truncate(line):
  61. line = line.lower()
  62. for pattern, replacement in trans:
  63. line = re.sub(pattern, replacement, line)
  64. pipe_parts = line.split('|')
  65. if len(pipe_parts) > 1:
  66. line = pipe_parts[0] + '|A'
  67. for x in line.split('/'):
  68. if x in avian:
  69. line = re.sub(x, 'avian', line)
  70. double_start = line.rfind(line[1:PREFIX_LENGTH], PREFIX_LENGTH)
  71. if double_start != -1:
  72. line = line[double_start:]
  73. return line.strip().title() + '\r\n'
  74. def main():
  75. with open(sys.argv[1], 'r') as f:
  76. for line in f:
  77. if line.startswith('>'):
  78. line = smart_truncate(line)
  79. print (line),
  80. if __name__ == '__main__':
  81. sys.exit(main())