# 15-extract-censo.py

import argparse
import logging
import os
from io import BytesIO
from zipfile import ZipFile

import boto3
import requests as rq
from boto3.s3.transfer import TransferConfig
from botocore.exceptions import ClientError
from requests.exceptions import URLRequired
  10. class Extract():
  11. def __init__(self, url, bucket_name=''):
  12. self.url = url
  13. self.bucket_name = bucket_name
  14. def download(self):
  15. self.arq = rq.get(self.url, stream=True)
  16. print(self.arq.status_code)
  17. return self.arq
  18. def extract(self, arq):
  19. z = ZipFile(BytesIO(arq.content))
  20. z.extractall(path='./tmp')
  21. def upload_file(self, object_name=None):
  22. # If S3 object_name was not specified, use file_name
  23. if object_name is None:
  24. object_name = os.path.basename(self.file_name)
  25. # Upload the file
  26. s3 = boto3.client('s3', region_name='us-east-2',
  27. aws_access_key_id=os.getenv('aws_access_key_id'), aws_secret_access_key=os.getenv('aws_secret_access_key'))
  28. try:
  29. with open('raw/' + self.file_name, "rb") as f:
  30. response = s3.upload_fileobj(f, self.bucket_name, object_name)
  31. except ClientError as e:
  32. logging.error(e)
  33. return False
  34. return True
  35. def run(self):
  36. arq = self.download()
  37. self.extract(arq)
  38. if __name__=='__main__':
  39. parser = argparse.ArgumentParser()
  40. parser.add_argument('--url', dest='url', default='https://download.inep.gov.br/microdados/microdados_educacao_superior_2019.zip',
  41. help='URL para download do arquivo')
  42. parser.add_argument('--bucket_name', dest='bucket_name', default='igti-edc-desafiofinal',
  43. help='S3 Bucket Name')
  44. parser.add_argument('--file_path', dest='file_path', default='./tmp/DADOS/ESCOLAS/ESCOLA_95.TXT',
  45. help='File path for upload to S3')
  46. args = parser.parse_args()
  47. etl = Extract(url=args.url, bucket_name=args.bucket_name)
  48. etl.run()
  49. s3_client = boto3.client('s3', region_name='us-east-2',
  50. aws_access_key_id=os.getenv('aws_access_key_id'),
  51. aws_secret_access_key=os.getenv('aws_secret_access_key'))
  52. s3_upload_cfg = boto3.s3.transfer.TransferConfig()
  53. pasta = os.listdir('./tmp')
  54. print("Upload ALUNO")
  55. s3_client.upload_file(f"./tmp/{pasta[0]}/dados/SUP_ALUNO_2019.CSV",
  56. "igti-edc-desafiofinal",
  57. "raw/censoedsup2019/aluno/SUP_ALUNO_2019.CSV",
  58. Config=s3_upload_cfg
  59. )
  60. print("Upload DOCENTE")
  61. s3_client.upload_file(f"./tmp/{pasta[0]}/dados/SUP_DOCENTE_2019.CSV",
  62. "igti-edc-desafiofinal",
  63. "raw/censoedsup2019/docente/SUP_DOCENTE_2019.CSV",
  64. Config=s3_upload_cfg
  65. )
  66. print("Upload CURSO")
  67. s3_client.upload_file(f"./tmp/{pasta[0]}/dados/SUP_CURSO_2019.CSV",
  68. "igti-edc-desafiofinal",
  69. "raw/censoedsup2019/curso/SUP_CURSO_2019.CSV",
  70. Config=s3_upload_cfg
  71. )
  72. print("Upload IES")
  73. s3_client.upload_file(f"./tmp/{pasta[0]}/dados/SUP_IES_2019.CSV",
  74. "igti-edc-desafiofinal",
  75. "raw/censoedsup2019/IES/SUP_IES_2019.CSV",
  76. Config=s3_upload_cfg
  77. )