main_38.py 2.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950
  1. def psdnmyz_3():
  2. # load TWO csv to be sent to be pseudonymz
  3. # metrics_df=pd.read_csv('/home/arasan/testrep/psmd/jureca/TOTAL_METRICS_Skel_header.csv')
  4. seg_df = pd.read_csv('/home/arasan/testrep/psmd/jureca/psmd_seg2_vols.csv')
  5. # add rnadom id column to both df
  6. # below line is a disaster
  7. # metrics_df['RNDNAME'] = metrics_df['NAME'].apply(lambda x: gocept.pseudonymize.integer(x, 'secret'))
  8. # seg_df['RNDNAME'] = seg_df['NAME'].apply(lambda x: gocept.pseudonymize.integer(x, 'secret'))
  9. # a=np.random.randint(100000,999999,metrics_df.NAME.values.size)
  10. # metrics_df['RNDNAME']=a
  11. # print 'after rqndom id has been added'
  12. # flagg=True
  13. # while(flagg):
  14. # try:
  15. # print pd.concat(g for _, g in metrics_df.groupby("RNDNAME") if len(g) > 1)
  16. # except ValueError:
  17. # print 'NO DUPLICAtes'
  18. # metrics_df.to_csv('/home/arasan/testrep/psmd/jureca/TOTAL_rnd_temp.csv')
  19. # flagg=False
  20. # else:
  21. # print 'DUPES'
  22. # metrics_df=metrics_df.drop('RNDNAME', axis=1)
  23. # a=np.random.randint(100000,999999,metrics_df.NAME.values.size)
  24. # metrics_df['RNDNAME']=a
  25. # load double chekced randomeized df 1) above try catch 2) using np unique
  26. metrnd = pd.read_csv('/home/arasan/testrep/psmd/jureca/TOTAL_rnd_temp.csv')
  27. seg_df['SNO'] = seg_df.index + 1
  28. # metrnd['SNO']=seg_df.index+1
  29. # add RNDAME column to seg_df
  30. seg_df['RNDNAME'] = metrnd.RNDNAME.values
  31. # rename columns NANME to ID and RNDNAME to NAME
  32. # seg_df=seg_df.rename(index=str, columns={"NAME": "ID"})
  33. seg_df = seg_df.rename(index=str, columns={"RNDNAME": "NAME"})
  34. # metrnd=metrnd.rename(index=str, columns={"NAME": "ID"})
  35. # metrnd=metrnd.rename(index=str, columns={"RNDNAME": "NAME"})
  36. # dump map out with 3 columns ID,NAME,SNO
  37. # mapdf=metrnd[['ID','NAME','SNO']]
  38. # mapdf.to_csv('/home/arasan/testrep/psmd/jureca/bordeaux_packet2/psdnmyz_map.csv',index=False)
  39. # drop ID and SNO
  40. seg_df = seg_df.drop(['ID', 'SNO'], axis=1)
  41. # metrnd=metrnd.drop(['ID','SNO'],axis=1)
  42. # move NAME column to first position
  43. # metrnd=metrnd[['NAME','mean_skel_MD_LH_RH','sd_skel_MD_LH_RH','Pw90S_skel_MD_LH_RH','mean_skel_FA_LH_RH','sd_skel_FA_LH_RH','mean_skel_AD_LH_RH','sd_skel_AD_LH_RH','mean_skel_RD_LH_RH','sd_skel_RD_LH_RH']]
  44. seg_df = seg_df[['NAME', 'AGE', 'SEX', 'ICV']]
  45. # if pd.concat(g for _, g in metrics_df.groupby("RNDNAME") if len(g) > 1).RNDNAME.values.size:
  46. # print 'NOT OK'
  47. # else:
  48. # print 'OK'
  49. # metrnd.to_csv('/home/arasan/testrep/psmd/jureca/bordeaux_packet2/TOTAL_METRICS_Skel_header.csv',index=False)
  50. seg_df.to_csv('/home/arasan/testrep/psmd/jureca/bordeaux_packet3/psmd_seg2_vols.csv', index=False)