# main.py

  1. from cassandra_rw import CassandraReadWriteDb
  2. from TxInfo import TxInfoModel
  3. from ml_model import BuildMlPipeline
  4. from sklearn.model_selection import train_test_split
  5. if __name__ == '__main__':
  6. cass_rw = CassandraReadWriteDb(ip_addrs=['172.17.0.2'], keyspace="emp")
  7. #Load data in cassandra from csv files
  8. cass_rw.sync_class_table(TxInfoModel)
  9. cass_rw.write_file_table('creditcard.csv')
  10. #Load cassandra data into pandas
  11. credit_data = cass_rw.get_pandas_from_cassandra()
  12. print ('Data loaded into dataframe')
  13. #Create models
  14. ml_pipeline = BuildMlPipeline()
  15. ml_pipeline.set_estimators('sgdClassifier','randomForestClassifier')
  16. ml_pipeline.set_scalers('standardscaler')
  17. ml_pipeline.set_samplers('smote','smoteenn')
  18. ml_pipeline.create_pipelines()
  19. #Hyperparameter Configuration
  20. params_dict = {}
  21. params_dict['smote'] = {'smote__k_neighbors':[5,10,15]}
  22. params_dict['smoteenn'] = {'smoteenn__sampling_strategy':['auto','all','not majority']}
  23. params_dict['randomforestclassifier'] = {'randomforestclassifier__n_estimators':[8,12]}
  24. params_dict['svc'] = {'svc__kernel':['linear','rbf','poly'],'svc__C':[.1,1,10]}
  25. ml_pipeline.set_hyperparameters(params_dict)
  26. #credit_data = credit_data.sample(10000)
  27. X = credit_data.drop(['tx_id','Time','C'],axis=1)
  28. y = credit_data.C
  29. trainX, testX, trainY, testY = train_test_split(X,y)
  30. print ('Model Training')
  31. #model training
  32. ml_pipeline.fit(trainX,trainY)
  33. #Calculating model performance
  34. ml_pipeline.score(testX,testY)