#!/usr/bin/env python3
"""
json-extractor.py, a simple command line tool for creating csv files from
large json datasets.

Copyright (C) 2014 Ryan Chartier

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
from datetime import datetime
import json
import os
import re
import argparse
import csv
import sys
import gzip

strptime = datetime.strptime


class attriObject:
    """Represents a colon-separated attribute path into a json object."""

    def __init__(self, string, na):
        self.raw = string
        self.value = string.split(":")
        self.title = self.value[-1]
        self.na = na

    def getElement(self, json_object):
        """Follow the attribute path through json_object and return its value."""
        found = [json_object]
        for entry in self.value:
            for index in range(len(found)):
                try:
                    found[index] = found[index][entry]
                except (TypeError, KeyError):
                    if self.na:
                        return "NA"
                    print("'{}' is not a valid json entry.".format(self.raw))
                    sys.exit()
                # If the single search object is a list, search the entire
                # list. Error on nested lists.
                if isinstance(found[index], list):
                    if len(found) > 1:
                        raise Exception("Extractor currently does not handle nested lists.")
                    found = found[index]
        if len(found) == 0:
            return "NA"
        elif len(found) == 1:
            return found[0]
        else:
            # str() guards against non-string values (e.g. numeric ids).
            return ";".join(str(f) for f in found)
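
# Example (hypothetical data, for illustration): given the tweet fragment
#
#   obj = {"user": {"screen_name": "alice"},
#          "entities": {"hashtags": [{"text": "gpl"}, {"text": "foss"}]}}
#
# attriObject("user:screen_name", na=False).getElement(obj) returns "alice",
# while the path "entities:hashtags:text" fans out over the hashtag list and
# returns "gpl;foss".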


def json_entries(args):
    """Iterates over json entries in every matching file in args.path."""
    for filename in os.listdir(args.path):
        if re.match(args.string, filename) and ".json" in filename:
            # Transparently handle gzip-compressed archives.
            f = gzip.open if filename.endswith(".gz") else open
            print("parsing", filename)
            with f(args.path + filename, 'rb') as data_file:
                for line in data_file:
                    try:
                        json_object = json.loads(line.decode(args.encoding))
                    except ValueError:
                        print("Error in", filename, "entry incomplete.")
                        continue
                    # A line may hold a single object or a list of objects.
                    if isinstance(json_object, list):
                        for jobject in json_object:
                            yield jobject
                    else:
                        yield json_object
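
# json_entries assumes newline-delimited json: each line of a matching file
# holds one object (or one list of objects). A file such as tweets-01.json.gz
# (name hypothetical) might contain:
#
#   {"id_str": "1", "created_at": "Mon Jun 02 09:00:00 +0000 2014", "text": "hello"}
#   {"id_str": "2", "created_at": "Mon Jun 02 09:01:00 +0000 2014", "text": "world"}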


def parse(args):
    # newline='' keeps the csv module from inserting blank rows on Windows.
    with open(args.output, 'w+', encoding="utf-8", newline='') as output:
        print("Opened", args.output)
        if not args.compress:
            csv_writer = csv.writer(output, dialect=args.dialect)
            if not args.nolabel:
                csv_writer.writerow([a.title for a in args.attributes])
        count = 0
        tweets = set()
        for json_object in json_entries(args):
            # Check for duplicates.
            if args.id:
                identity = args.id.getElement(json_object)
                if identity in tweets:
                    continue
                tweets.add(identity)
            # Check for time restrictions.
            if args.start or args.end:
                tweet_time = strptime(args.date.getElement(json_object), args.dateformat)
                if args.start and args.start > tweet_time:
                    continue
                if args.end and args.end < tweet_time:
                    continue
            # Check for hashtag; the for/else skips the tweet when no
            # hashtag matched.
            if args.hashtag:
                for entity in json_object['entities']["hashtags"]:
                    if entity['text'].lower() == args.hashtag:
                        break
                else:
                    continue
            if args.compress:
                # Compression mode: re-emit the tweet as a json line.
                json.dump(json_object, output)
                output.write("\n")
            else:
                # Write this tweet to csv.
                item = [i.getElement(json_object) for i in args.attributes]
                csv_writer.writerow(item)
            count += 1
        print("Recorded {} items.".format(count))
        if tweets:
            print("largest id:", max(tweets))
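
# Note: with -compress the filtered tweets are written back out as
# newline-delimited json (one object per line) rather than csv, so the
# resulting .json file can be fed back through json_entries on a later run.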


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Extracts attributes from tweets.')
    parser.add_argument("attributes", nargs='*', help="Attributes to search for. Attributes nested inside other attributes should be separated by a colon. Example: user:screen_name, entities:hashtags:text.")
    parser.add_argument("-string", default="", help="Regular expression for files to parse. Defaults to the empty string.")
    parser.add_argument("-path", default="./", help="Optional path to folder containing tweets. Defaults to the current folder.")
    parser.add_argument("-id", default="", help="Defines which entry should be used as the element id. Defaults to no duplicate id checking.")
    parser.add_argument("-na", action="store_true", help="Insert NA for absent entries instead of raising an error.")
    parser.add_argument("-nolabel", action="store_true", help="Prevents writing column headers to the csv file.")
    parser.add_argument("-compress", action="store_true", help="Compress json archives into a single file. Ignores csv column choices.")
    parser.add_argument("-output", default="output", help="Optional file to output results. Defaults to 'output'.")
    parser.add_argument("-dialect", default="excel", help="Sets the dialect for csv output. Defaults to excel. See csv.list_dialects() in the python csv module.")
    parser.add_argument("-encoding", default="utf-8", help="Sets the character encoding for json files. Defaults to 'utf-8'.")
    parser.add_argument("-date", default="created_at", help="Defines where to find the date of an entry.")
    parser.add_argument("-dateformat", default='%a %b %d %H:%M:%S +0000 %Y', help="Defines the format dates are given in.")
    parser.add_argument("-start", default="", help="Defines the start date for tweets. Format: dd:mm:yyyy")
    parser.add_argument("-end", default="", help="Defines the end date for tweets. Format: dd:mm:yyyy")
    parser.add_argument("-hashtag", default="", help="Defines a hashtag that must appear in parsed tweets.")
    args = parser.parse_args()
    if args.compress:
        args.output += ".json"
    else:
        args.output += ".csv"
    if not args.path.endswith("/"):
        args.path += "/"
    if args.id:
        args.id = attriObject(args.id, args.na)
    args.date = attriObject(args.date, args.na)
    args.attributes = [attriObject(i, args.na) for i in args.attributes]
    args.string = re.compile(args.string)

    # Tweet-specific restrictions.
    args.start = strptime(args.start, '%d:%m:%Y') if args.start else False
    args.end = strptime(args.end, '%d:%m:%Y') if args.end else False
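    # Example (illustrative values): "-start 01:06:2014 -end 30:06:2014"
    # keeps only tweets whose created_at field -- e.g.
    # "Mon Jun 02 09:00:00 +0000 2014" under the default -dateformat --
    # falls between those two dates.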
    args.hashtag = args.hashtag.lower()

    parse(args)