123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104 |
- from sklearn.model_selection import GridSearchCV
- from sklearn.metrics import confusion_matrix
- from sklearn.ensemble import RandomForestClassifier
- from sklearn.linear_model import SGDClassifier
- from sklearn.svm import SVC
- from sklearn.preprocessing import StandardScaler, MinMaxScaler
- from joblib import dump, load
- from imblearn.pipeline import make_pipeline
- from imblearn.over_sampling import SMOTE
- from imblearn.combine import SMOTEENN
class BuildMlPipeline:
    """Build and grid-search every scaler/sampler/estimator combination.

    Intended call order: pick components with ``set_estimators``,
    ``set_scalers`` and ``set_samplers``; call ``create_pipelines`` to build
    one pipeline per combination; optionally register parameter grids with
    ``set_hyperparameters``; then ``fit`` and ``score``.
    """

    def __init__(self):
        # Start from empty collections so that a method called before its
        # matching setter simply does nothing instead of raising
        # AttributeError.
        self.estimators = []
        self.scalers = []
        self.samplers = []
        self.hyperparameters = {}
        self.model_pipelines = []
        self.gs_pipelines = []

    @staticmethod
    def _instantiate(registry, names, kind):
        """Return one fresh instance per entry of ``names``.

        ``registry`` maps a name to a zero-argument factory (a class).
        Raises ``KeyError`` naming the offending entries when any requested
        name is not registered.
        """
        unknown = [name for name in names if name not in registry]
        if unknown:
            raise KeyError(
                'unknown {} name(s) {}; expected one of {}'.format(
                    kind, unknown, sorted(registry)))
        # Instantiate only what was asked for; a repeated name yields
        # independent objects rather than one shared instance.
        return [registry[name]() for name in names]

    def set_estimators(self, *args):
        """Select the estimators to use, by name.

        Accepted names: 'randomForestClassifier', 'svc', 'sgdClassifier'.
        Each is created with its default constructor arguments.
        Raises ``KeyError`` for any other name.
        """
        estimator_db = {
            'randomForestClassifier': RandomForestClassifier,
            'svc': SVC,
            'sgdClassifier': SGDClassifier,
        }
        self.estimators = self._instantiate(estimator_db, args, 'estimator')

    def set_scalers(self, *args):
        """Select the scalers to use, by name.

        Accepted names: 'standardscaler', 'minmaxscaler'.
        Raises ``KeyError`` for any other name.
        """
        scaler_db = {
            'standardscaler': StandardScaler,
            'minmaxscaler': MinMaxScaler,
        }
        self.scalers = self._instantiate(scaler_db, args, 'scaler')

    def set_samplers(self, *args):
        """Select the resamplers to use, by name.

        Accepted names: 'smote', 'smoteenn'.
        Raises ``KeyError`` for any other name.
        """
        sampler_db = {
            'smote': SMOTE,
            'smoteenn': SMOTEENN,
        }
        self.samplers = self._instantiate(sampler_db, args, 'sampler')

    def set_hyperparameters(self, params):
        """Register the parameter grids consulted by ``fit``.

        ``params`` maps a pipeline step name to a dict of
        ``{'<step>__<param>': [candidate values]}``. ``None`` is stored as an
        empty mapping, i.e. no grid at all.
        """
        self.hyperparameters = {} if params is None else params

    def create_pipelines(self):
        """Build one pipeline per (estimator, sampler, scaler) combination.

        The result replaces ``self.model_pipelines``. Steps are ordered
        scaler -> sampler -> estimator. If any of the three component lists
        is empty, no pipeline is created.
        """
        self.model_pipelines = [
            make_pipeline(scaler, sampler, estimator)
            for estimator in self.estimators
            for sampler in self.samplers
            for scaler in self.scalers
        ]

    def fit(self, trainX, trainY):
        """Grid-search every pipeline on the given training data.

        For each pipeline the grid is the union of the registered
        hyperparameter dicts whose key equals one of the pipeline's step
        names; steps without an entry contribute nothing. Each search uses
        5-fold cross-validation with ``n_jobs=-1``. The fitted searches
        replace ``self.gs_pipelines``, in pipeline order.
        """
        self.gs_pipelines = []
        for pipeline in self.model_pipelines:
            param_grid = {}
            for step_name, _ in pipeline.steps:
                param_grid.update(self.hyperparameters.get(step_name, {}))

            search = GridSearchCV(
                pipeline, param_grid=param_grid, n_jobs=-1, cv=5)
            search.fit(trainX, trainY)
            self.gs_pipelines.append(search)

    def score(self, testX, testY):
        """Print each best estimator and its confusion matrix on test data.

        For every fitted search, in order, prints the best estimator and
        then the search's index followed by the confusion matrix of its
        predictions for ``testX`` against ``testY``. Returns ``None``.
        """
        for idx, search in enumerate(self.gs_pipelines):
            best = search.best_estimator_
            y_pred = best.predict(testX)
            print(best)
            print(idx, confusion_matrix(y_true=testY, y_pred=y_pred))
import pandas as pd
from sklearn.model_selection import train_test_split


def main(csv_path='creditcard.csv', sample_size=20000):
    """Grid-search the configured pipelines on a sample of a CSV data set.

    Reads ``csv_path``, draws ``sample_size`` random rows, splits them into
    train and test parts, fits every pipeline on the training part only and
    prints the confusion matrices obtained on the held-out test part.
    """
    ml_pipeline = BuildMlPipeline()
    ml_pipeline.set_estimators('randomForestClassifier')
    ml_pipeline.set_scalers('standardscaler')
    ml_pipeline.set_samplers('smote', 'smoteenn')
    ml_pipeline.create_pipelines()

    print(ml_pipeline.model_pipelines)

    # Grids are keyed by pipeline step name; entries for steps that are not
    # part of a given pipeline (e.g. 'svc' here) are simply ignored by fit().
    params_dict = {
        'smote': {'smote__k_neighbors': [5, 10, 15]},
        'smoteenn': {
            'smoteenn__sampling_strategy': ['auto', 'all', 'not majority'],
        },
        'randomforestclassifier': {
            'randomforestclassifier__n_estimators': [8, 12],
        },
        'svc': {
            'svc__kernel': ['linear', 'rbf', 'poly'],
            'svc__C': [.1, 1, 10],
        },
    }
    ml_pipeline.set_hyperparameters(params_dict)

    credit_data = pd.read_csv(csv_path).sample(sample_size)
    # NOTE(review): column 'C' is used as the target label; presumably it is
    # the class column of the data set -- confirm against the CSV header.
    X = credit_data.drop(['Time', 'C'], axis=1)
    y = credit_data.C
    trainX, testX, trainY, testY = train_test_split(X, y)

    # Fit on the training split only. Fitting on the full X, y would let the
    # models see the test rows and make the reported matrices meaningless.
    ml_pipeline.fit(trainX, trainY)
    ml_pipeline.score(testX, testY)


if __name__ == '__main__':
    main()
-
|