#!/usr/bin/env python
# to_parquet.py
import glob
import json
import os
import sys
from datetime import datetime

import numpy as np
import pandas as pd
from zat.log_to_dataframe import LogToDataFrame
  10. logFile = sys.argv[1]
  11. baseName = sys.argv[2]
  12. s3Bucket = sys.argv[3]
  13. # Zat hangs if the file is not in ascii or if it is a conn-summary file
  14. if baseName == 'conn-summary':
  15. print("conn-summary detected, exiting")
  16. sys.exit()
  17. current_date = datetime.now()
  18. dateDay = current_date.strftime('%d')
  19. dateMonth = current_date.strftime('%m')
  20. dateYear = current_date.strftime('%Y')
  21. dateStr = current_date.strftime('%Y-%m-%d-%H')
  22. print(logFile)
  23. log_to_df = LogToDataFrame()
  24. df = log_to_df.create_dataframe(logFile)
  25. # add a type to make analysis easier in combined dataframes
  26. df['type']=baseName
  27. df.to_parquet(s3Bucket + dateYear + '/' + dateMonth + '/' + dateDay + '/' + baseName + '-' + dateStr + '.parquet',engine='fastparquet', compression='gzip')