make_basic.py 1.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960
  1. from decimal import Decimal
  2. import pyarrow as pa
  3. import pyarrow.parquet as pq
  4. import pandas as pd
  5. """
  6. Script to generate the basic.parquet file used by acceptance tests
  7. """
  8. filename = "basic.parquet"
  9. df = pd.DataFrame(
  10. {
  11. "customer_id": ["12345", "23456", "34567"],
  12. "customerId": [12345, 23456, 34567],
  13. "customer_id_decimal": [
  14. Decimal("123.450"),
  15. Decimal("234.560"),
  16. Decimal("345.670"),
  17. ],
  18. "user_info": [
  19. {
  20. "personal_information": {
  21. "email": "12345@test.com",
  22. "first_name": "John",
  23. "last_name": "Doe",
  24. }
  25. },
  26. {
  27. "personal_information": {
  28. "email": "23456@test.com",
  29. "first_name": "Jane",
  30. "last_name": "Doe",
  31. }
  32. },
  33. {
  34. "personal_information": {
  35. "email": "34567@test.com",
  36. "first_name": "Mary",
  37. "last_name": "Smith",
  38. }
  39. },
  40. ],
  41. "days_off": [
  42. ["2020-01-01", "2020-01-02"],
  43. ["2020-01-01", "2020-01-07"],
  44. ["2020-01-05"],
  45. ],
  46. }
  47. )
  48. table = pa.Table.from_pandas(df)
  49. pq.write_table(table, filename)
  50. parquet_file = pq.ParquetFile(filename)
  51. schema = parquet_file.metadata.schema.to_arrow_schema().remove_metadata()
  52. table2 = parquet_file.read()
  53. df = table2.to_pandas()
  54. print("File written. Data:\n\n{}".format(df))