# main_37.py
  1. def psdnmyz_2():
  2. # load TWO csv to be sent to be pseudonymz
  3. # metrics_df=pd.read_csv('/home/arasan/testrep/psmd/jureca/TOTAL_METRICS_Skel_header.csv')
  4. seg_df = pd.read_csv('/home/arasan/testrep/psmd/jureca/psmd_seg_vols.csv')
  5. # add rnadom id column to both df
  6. # below line is a disaster
  7. # metrics_df['RNDNAME'] = metrics_df['NAME'].apply(lambda x: gocept.pseudonymize.integer(x, 'secret'))
  8. # seg_df['RNDNAME'] = seg_df['NAME'].apply(lambda x: gocept.pseudonymize.integer(x, 'secret'))
  9. # a=np.random.randint(100000,999999,metrics_df.NAME.values.size)
  10. # metrics_df['RNDNAME']=a
  11. # print 'after rqndom id has been added'
  12. # flagg=True
  13. # while(flagg):
  14. # try:
  15. # print pd.concat(g for _, g in metrics_df.groupby("RNDNAME") if len(g) > 1)
  16. # except ValueError:
  17. # print 'NO DUPLICAtes'
  18. # metrics_df.to_csv('/home/arasan/testrep/psmd/jureca/TOTAL_rnd_temp.csv')
  19. # flagg=False
  20. # else:
  21. # print 'DUPES'
  22. # metrics_df=metrics_df.drop('RNDNAME', axis=1)
  23. # a=np.random.randint(100000,999999,metrics_df.NAME.values.size)
  24. # metrics_df['RNDNAME']=a
  25. # load double chekced randomeized df 1) above try catch 2) using np unique
  26. metrnd = pd.read_csv('/home/arasan/testrep/psmd/jureca/TOTAL_rnd_temp.csv')
  27. seg_df['SNO'] = seg_df.index + 1
  28. metrnd['SNO'] = seg_df.index + 1
  29. # add RNDAME column to seg_df
  30. seg_df['RNDNAME'] = metrnd.RNDNAME.values
  31. # rename columns NANME to ID and RNDNAME to NAME
  32. seg_df = seg_df.rename(index=str, columns={"NAME": "ID"})
  33. seg_df = seg_df.rename(index=str, columns={"RNDNAME": "NAME"})
  34. metrnd = metrnd.rename(index=str, columns={"NAME": "ID"})
  35. metrnd = metrnd.rename(index=str, columns={"RNDNAME": "NAME"})
  36. # dump map out with 3 columns ID,NAME,SNO
  37. mapdf = metrnd[['ID', 'NAME', 'SNO']]
  38. mapdf.to_csv('/home/arasan/testrep/psmd/jureca/bordeaux_packet2/psdnmyz_map.csv', index=False)
  39. # drop ID and SNO
  40. seg_df = seg_df.drop(['ID', 'SNO'], axis=1)
  41. metrnd = metrnd.drop(['ID', 'SNO'], axis=1)
  42. # move NAME column to first position
  43. metrnd = metrnd[['NAME', 'mean_skel_MD_LH_RH', 'sd_skel_MD_LH_RH', 'Pw90S_skel_MD_LH_RH', 'mean_skel_FA_LH_RH',
  44. 'sd_skel_FA_LH_RH', 'mean_skel_AD_LH_RH', 'sd_skel_AD_LH_RH', 'mean_skel_RD_LH_RH',
  45. 'sd_skel_RD_LH_RH']]
  46. seg_df = seg_df[['NAME', 'AGE', 'SEX', 'GMV', 'WMV', 'CSFV', 'ICV']]
  47. # if pd.concat(g for _, g in metrics_df.groupby("RNDNAME") if len(g) > 1).RNDNAME.values.size:
  48. # print 'NOT OK'
  49. # else:
  50. # print 'OK'
  51. metrnd.to_csv('/home/arasan/testrep/psmd/jureca/bordeaux_packet2/TOTAL_METRICS_Skel_header.csv', index=False)
  52. seg_df.to_csv('/home/arasan/testrep/psmd/jureca/bordeaux_packet2/psmd_seg_vols.csv', index=False)