# ml_model.py — builds scaler -> sampler -> estimator pipelines (imblearn)
# and grid-searches each combination on imbalanced classification data.
  1. from sklearn.model_selection import GridSearchCV
  2. from sklearn.metrics import confusion_matrix
  3. from sklearn.ensemble import RandomForestClassifier
  4. from sklearn.linear_model import SGDClassifier
  5. from sklearn.svm import SVC
  6. from sklearn.preprocessing import StandardScaler, MinMaxScaler
  7. from joblib import dump, load
  8. from imblearn.pipeline import make_pipeline
  9. from imblearn.over_sampling import SMOTE
  10. from imblearn.combine import SMOTEENN
  11. class BuildMlPipeline:
  12. def __init__(self):
  13. pass
  14. def set_estimators(self, *args):
  15. estimator_db = {
  16. 'randomForestClassifier': RandomForestClassifier(),
  17. 'svc': SVC(),
  18. 'sgdClassifier': SGDClassifier(),
  19. }
  20. self.estimators = list(map( lambda algo: estimator_db[algo],args))
  21. def set_scalers(self, *args):
  22. scaler_db = {
  23. 'standardscaler':StandardScaler(),
  24. 'minmaxscaler':MinMaxScaler(),
  25. }
  26. self.scalers = list(map( lambda scaler: scaler_db[scaler],args))
  27. def set_samplers(self, *args):
  28. sampler_db = {
  29. 'smote':SMOTE(),
  30. 'smoteenn':SMOTEENN(),
  31. }
  32. self.samplers = list(map( lambda sampler: sampler_db[sampler],args))
  33. def set_hyperparameters(self, params):
  34. self.hyperparameters = params
  35. def create_pipelines(self):
  36. self.model_pipelines = []
  37. for estimator in self.estimators:
  38. for sampler in self.samplers:
  39. for scaler in self.scalers:
  40. pipeline = make_pipeline(scaler, sampler, estimator)
  41. self.model_pipelines.append(pipeline)
  42. def fit(self, trainX, trainY):
  43. self.gs_pipelines = []
  44. for idx,pipeline in enumerate(self.model_pipelines):
  45. elems = list(map(lambda x:x[0] ,pipeline.steps))
  46. param_grid = {}
  47. for elem in elems:
  48. if elem in self.hyperparameters:
  49. param_grid.update(self.hyperparameters[elem])
  50. gs = GridSearchCV(pipeline, param_grid= param_grid, n_jobs=-1, cv=5)
  51. gs.fit(trainX, trainY)
  52. #dump(gs, 'model'+idx+'.pipeline')
  53. self.gs_pipelines.append(gs)
  54. def score(self, testX, testY):
  55. for idx,model in enumerate(self.gs_pipelines):
  56. y_pred = model.best_estimator_.predict(testX)
  57. print (model.best_estimator_)
  58. print (idx,confusion_matrix(y_true=testY,y_pred=y_pred))
  59. import pandas as pd
  60. from sklearn.model_selection import train_test_split
  61. if __name__ == '__main__':
  62. ml_pipeline = BuildMlPipeline()
  63. ml_pipeline.set_estimators('randomForestClassifier')
  64. ml_pipeline.set_scalers('standardscaler')
  65. ml_pipeline.set_samplers('smote','smoteenn')
  66. ml_pipeline.create_pipelines()
  67. print (ml_pipeline.model_pipelines)
  68. params_dict = {}
  69. params_dict['smote'] = {'smote__k_neighbors':[5,10,15]}
  70. params_dict['smoteenn'] = {'smoteenn__sampling_strategy':['auto','all','not majority']}
  71. params_dict['randomforestclassifier'] = {'randomforestclassifier__n_estimators':[8,12]}
  72. params_dict['svc'] = {'svc__kernel':['linear','rbf','poly'],'svc__C':[.1,1,10]}
  73. ml_pipeline.set_hyperparameters(params_dict)
  74. credit_data = pd.read_csv('creditcard.csv').sample(20000)
  75. X = credit_data.drop(['Time','C'],axis=1)
  76. y = credit_data.C
  77. trainX, testX, trainY, testY = train_test_split(X,y)
  78. ml_pipeline.fit(X,y)
  79. ml_pipeline.score(testX,testY)