12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152 |
def psdnmyz_2(seg_csv='/home/arasan/testrep/psmd/jureca/psmd_seg_vols.csv',
              rnd_csv='/home/arasan/testrep/psmd/jureca/TOTAL_rnd_temp.csv',
              out_dir='/home/arasan/testrep/psmd/jureca/bordeaux_packet2'):
    """Pseudonymise the segmentation-volume and skeleton-metrics tables.

    Reads two CSV files that describe the same subjects row by row, replaces
    the real subject name by the pre-computed random ID, and writes three
    CSV files into ``out_dir``:

    * ``psdnmyz_map.csv`` -- the key table with columns ``ID`` (original
      name), ``NAME`` (random ID) and ``SNO`` (1-based row number).
    * ``TOTAL_METRICS_Skel_header.csv`` -- the skeleton metrics, keyed by the
      random ID only.
    * ``psmd_seg_vols.csv`` -- the segmentation volumes, keyed by the random
      ID only.

    The random IDs are NOT generated here; they are taken from the
    ``RNDNAME`` column of ``rnd_csv``.  (An earlier, now removed, draft of
    this function drew them with ``np.random.randint(100000, 999999, ...)``
    and saved the result to that file.)

    Parameters
    ----------
    seg_csv : str
        CSV with the columns ``NAME``, ``AGE``, ``SEX``, ``GMV``, ``WMV``,
        ``CSFV`` and ``ICV``.
    rnd_csv : str
        CSV with the columns ``NAME``, ``RNDNAME`` and the ten
        ``*_skel_*_LH_RH`` metric columns listed below.
    out_dir : str
        Directory receiving the three output files; created if missing.

    Raises
    ------
    ValueError
        If the two inputs differ in length, if their ``NAME`` columns do not
        agree row by row, or if ``RNDNAME`` contains duplicates.  Any of
        these would otherwise attach a pseudonym to the wrong subject
        without any visible error.
    """
    import os

    metric_columns = [
        'NAME',
        'mean_skel_MD_LH_RH', 'sd_skel_MD_LH_RH', 'Pw90S_skel_MD_LH_RH',
        'mean_skel_FA_LH_RH', 'sd_skel_FA_LH_RH',
        'mean_skel_AD_LH_RH', 'sd_skel_AD_LH_RH',
        'mean_skel_RD_LH_RH', 'sd_skel_RD_LH_RH',
    ]
    seg_columns = ['NAME', 'AGE', 'SEX', 'GMV', 'WMV', 'CSFV', 'ICV']

    seg_df = pd.read_csv(seg_csv)
    metrnd = pd.read_csv(rnd_csv)

    # The pseudonyms are transferred to seg_df by row position, so both
    # tables must list the same subjects in the same order.
    if len(seg_df) != len(metrnd):
        raise ValueError(
            'row count mismatch: %d rows in %s but %d rows in %s'
            % (len(seg_df), seg_csv, len(metrnd), rnd_csv))
    # Compare as strings so that a differing dtype inference between the two
    # files does not produce a spurious mismatch.
    names_differ = (seg_df['NAME'].astype(str).values
                    != metrnd['NAME'].astype(str).values)
    if names_differ.any():
        raise ValueError(
            'NAME columns of %s and %s disagree in %d row(s); refusing to '
            'assign pseudonyms by position'
            % (seg_csv, rnd_csv, int(names_differ.sum())))
    # Two subjects sharing one random ID could not be told apart afterwards.
    if not metrnd['RNDNAME'].is_unique:
        raise ValueError('duplicate RNDNAME values in %s' % rnd_csv)

    pseudonyms = metrnd['RNDNAME'].values

    # Key table: original name -> pseudonym, plus a 1-based serial number.
    mapdf = pd.DataFrame(
        {
            'ID': metrnd['NAME'].values,
            'NAME': pseudonyms,
            'SNO': list(range(1, len(metrnd) + 1)),
        },
        columns=['ID', 'NAME', 'SNO'],
    )

    # Drop the real name, promote the pseudonym to NAME, and keep only the
    # published columns (this also discards any saved index column).
    metrics_out = metrnd.drop('NAME', axis=1)
    metrics_out = metrics_out.rename(columns={'RNDNAME': 'NAME'})
    metrics_out = metrics_out[metric_columns]

    seg_out = seg_df.drop('NAME', axis=1)
    seg_out['NAME'] = pseudonyms
    seg_out = seg_out[seg_columns]

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    mapdf.to_csv(os.path.join(out_dir, 'psdnmyz_map.csv'), index=False)
    metrics_out.to_csv(
        os.path.join(out_dir, 'TOTAL_METRICS_Skel_header.csv'), index=False)
    seg_out.to_csv(os.path.join(out_dir, 'psmd_seg_vols.csv'), index=False)
|