practica3.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662
  1. # -*- coding: utf-8 -*-
  2. """
  3. Autor:
  4. Francisco Solano López Rodríguez
  5. Fecha:
  6. Noviembre/2018
  7. Contenido:
  8. Práctica 3
  9. Inteligencia de Negocio
  10. Grado en Ingeniería Informática
  11. Universidad de Granada
  12. """
  13. ''' -------------------- IMPORT LIBRARY -------------------- '''
  14. import pandas as pd
  15. import numpy as np
  16. import time
  17. import matplotlib.pyplot as plt
  18. import seaborn as sns
  19. from collections import Counter
  20. import datetime
  21. from sklearn.model_selection import StratifiedKFold, KFold
  22. from sklearn.model_selection import train_test_split, GridSearchCV
  23. from sklearn.feature_selection import VarianceThreshold
  24. from sklearn import ensemble
  25. ''' --- classifiers import --- '''
  26. from sklearn.linear_model import LogisticRegression
  27. from sklearn.ensemble import RandomForestClassifier
  28. from sklearn.ensemble import ExtraTreesClassifier
  29. from sklearn.neural_network import MLPClassifier
  30. from sklearn.neighbors import KNeighborsClassifier
  31. from sklearn import svm
  32. import xgboost as xgb
  33. import lightgbm as lgb
  34. from sklearn import tree
  35. from sklearn.svm import SVC, LinearSVC, NuSVC
  36. from sklearn.tree import DecisionTreeClassifier
  37. from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
  38. from sklearn.naive_bayes import GaussianNB
  39. from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
  40. from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
  41. from catboost import Pool, CatBoostClassifier
  42. ''' --- preprocessing import --- '''
  43. from sklearn import preprocessing
  44. from sklearn.preprocessing import OneHotEncoder
  45. from sklearn.preprocessing import LabelEncoder
  46. from sklearn.preprocessing import MinMaxScaler
  47. from sklearn.preprocessing import StandardScaler
  48. from sklearn.preprocessing import PolynomialFeatures
  49. from sklearn.preprocessing import scale
  50. from sklearn.preprocessing import Normalizer
  51. ''' --- metrics import --- '''
  52. from sklearn import metrics
  53. from sklearn.metrics import roc_curve, auc
  54. from sklearn.metrics import confusion_matrix
  55. from sklearn.metrics import accuracy_score
  56. from math import sin, cos, sqrt, atan2, radians
  57. # Obtener datos respecto a la fecha y obtener la edad del pozo
  58. def date_parser(df):
  59. date_recorder = list(map(lambda x: datetime.datetime.strptime(str(x), '%Y-%m-%d'),
  60. df['date_recorded'].values))
  61. df['year_recorder'] = list(map(lambda x: int(x.strftime('%Y')), date_recorder))
  62. df['weekday_recorder'] = list(map(lambda x: int(x.strftime('%w')), date_recorder))
  63. df['yearly_week_recorder'] = list(map(lambda x: int(x.strftime('%W')), date_recorder))
  64. df['month_recorder'] = list(map(lambda x: int(x.strftime('%m')), date_recorder))
  65. df['age'] = df['year_recorder'].values - df['construction_year'].values
  66. del df['date_recorded']
  67. return df
  68. # Obtener a distancia a la coordenada (0,0)
  69. def distancia(lon1, lat1, lon2, lat2):
  70. dlon = lon2 - lon1
  71. dlat = lat2 - lat1
  72. a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
  73. c = 2 * atan2(sqrt(a), sqrt(1 - a))
  74. R = 6371
  75. return R * c
  76. # Obtener la coordenada cartesiana x a partir de las longitud y la latitud
  77. def cartesian_x(lon, lat):
  78. lat=radians(lat)
  79. lon=radians(lon)
  80. R=6371.0
  81. x = R * cos(lat) * cos(lon)
  82. return x
  83. # Obtener la coordenada cartesiana y a partir de las longitud y la latitud
  84. def cartesian_y(lon, lat):
  85. lat=radians(lat)
  86. lon=radians(lon)
  87. R=6371.0
  88. y = R * cos(lat) * sin(lon)
  89. return y
  90. # Matriz de confusion
  91. def plot_confusion_matrix(y_test, predictions):
  92. cm = metrics.confusion_matrix(y_test, predictions)
  93. plt.figure(figsize=(9,9))
  94. sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True)
  95. plt.ylabel('Actual label')
  96. plt.xlabel('Predicted label')
  97. plt.show()
  98. # Funcion para realizar la validacion cruzada
  99. def cross_validation(clf, X, y, cv = None, min_max_scaler = False, scaled = False, standard_scaler = False, normalizer = False, poly = False, m_confusion = False):
  100. if cv == None:
  101. cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123456)
  102. iteration = 0
  103. for train, test in cv.split(X, y):
  104. X_train, X_test = X[train], X[test]
  105. y_train, y_test = y[train], y[test]
  106. if min_max_scaler:
  107. X_train = MinMaxScaler().fit_transform(X_train)
  108. X_test = MinMaxScaler().fit_transform(X_test)
  109. if scaled:
  110. X_train = scale(X_train)
  111. X_test = scale(X_test)
  112. if poly:
  113. X_train = PolynomialFeatures(degree = 2, interaction_only=True).fit_transform(X_train)
  114. X_test = PolynomialFeatures(degree = 2, interaction_only=True).fit_transform(X_test)
  115. if standard_scaler:
  116. transformer = StandardScaler().fit(X_train)
  117. X_train = transformer.transform(X_train)
  118. X_test = transformer.transform(X_test)
  119. if normalizer:
  120. transformer = Normalizer().fit(X_train)
  121. X_train = transformer.transform(X_train)
  122. X_test = transformer.transform(X_test)
  123. t = time.time()
  124. clf = clf.fit(X_train,y_train)
  125. training_time = time.time() - t
  126. predictions_train = clf.predict(X_train)
  127. predictions = clf.predict(X_test)
  128. print("--------- Iteración ", iteration, " --------- ")
  129. print("Tiempo :: ", training_time)
  130. print ("Train Accuracy :: ", accuracy_score(y_train, predictions_train))
  131. print ("Test Accuracy :: ", accuracy_score(y_test, predictions))
  132. print("")
  133. if m_confusion:
  134. plot_confusion_matrix(y_test, predictions)
  135. iteration += 1
  136. ''' ------------------------------------------------------------------ '''
  137. ''' --------------------------- READ DATA ---------------------------- '''
  138. ''' ------------------------------------------------------------------ '''
  139. print("\nWATER PUMP COMPETITION\n")
  140. print("Leyendo datos...")
  141. #los ficheros .csv se han preparado previamente para sustituir ,, y "Not known" por NaN (valores perdidos)
  142. data_x_orig = pd.read_csv('data/water_pump_tra.csv')
  143. data_y = pd.read_csv('data/water_pump_tra_target.csv')
  144. data_x_tst = pd.read_csv('data/water_pump_tst.csv')
  145. print(data_x_orig.shape)
  146. print(data_x_tst.shape)
  147. print("Lectura completada.\n")
  148. ''' ------------------------------------------------------------------ '''
  149. ''' -------------------------- LOOK AT DATA -------------------------- '''
  150. ''' ------------------------------------------------------------------ '''
  151. print('Viendo los datos:\n')
  152. data_x = data_x_orig
  153. print('num_private:')
  154. print(data_x['num_private'].value_counts()[0:3])
  155. print('recorded_by:')
  156. print(data_x['recorded_by'].value_counts())
  157. print(data_y.status_group.value_counts()/len(data_y))
  158. data_y.status_group.value_counts().plot(kind='bar')
  159. plt.xticks(rotation = 0)
  160. plt.show()
  161. print('Ejemplos con longitude = 0')
  162. print(len(data_x.ix[data_x['longitude']==0,'longitude']))
  163. print('Ejemplos con latitude = 0')
  164. print(len(data_x.ix[data_x['latitude']==-0.00000002,'latitude']))
  165. print('Ejemplos con construction_year = 0')
  166. print(len(data_x.ix[data_x['construction_year']==0,'construction_year']))
  167. corr = data_x.corr()
  168. sns.heatmap (corr)
  169. plt.xticks(rotation =45)
  170. plt.show()
  171. print("Valores perdidos:")
  172. print(data_x.isnull().sum())
  173. data_x.isnull().sum().plot.bar()
  174. plt.show()
  175. print('funder:\n')
  176. print(data_x['funder'].value_counts()[0:6])
  177. print('\ninstaller:\n')
  178. print(data_x['installer'].value_counts()[0:6])
  179. print('\npublic_meeting:\n')
  180. print(data_x['public_meeting'].value_counts()[0:6])
  181. print('\nscheme_management:\n')
  182. print(data_x['scheme_management'].value_counts()[0:6])
  183. print('\npermit:\n')
  184. print(data_x['permit'].value_counts()[0:6])
  185. print('\nsubvillage:\n')
  186. print(data_x['subvillage'].value_counts()[0:6])
  187. print('\nwpt_name:\n')
  188. print(data_x['wpt_name'].value_counts()[0:6])
  189. '''
  190. data_x['funder'].value_counts()[0:10].plot.bar()
  191. plt.show()
  192. data_x['installer'].value_counts().plot.bar()
  193. plt.show()
  194. data_x['public_meeting'].value_counts().plot.bar()
  195. plt.show()
  196. data_x['scheme_management'].value_counts().plot.bar()
  197. plt.show()
  198. data_x['permit'].value_counts().plot.bar()
  199. plt.show()
  200. data_x['subvillage'].value_counts().plot.bar()
  201. plt.show()
  202. data_x['wpt_name'].value_counts().plot.bar()
  203. plt.show()
  204. '''
  205. ''' ------------------------------------------------------------------ '''
  206. ''' ------------------------- PREPROCESSING -------------------------- '''
  207. ''' ------------------------------------------------------------------ '''
  208. print("\nPreprocesando datos...")
  209. data_x=data_x_orig.append(data_x_tst)
  210. ''' ------------------ DROP COLUMNS ------------------ '''
  211. print(" Borrando columnas...")
  212. columns_to_drop = ['id', 'num_private', 'recorded_by', 'scheme_name']
  213. data_x.drop(labels=columns_to_drop, axis=1, inplace = True)
  214. data_y.drop(labels=['id'], axis=1,inplace = True)
  215. ''' ------------------ MISSING VALUES ------------------ '''
  216. print(" Modificando valores nan...")
  217. data_x['funder'] = data_x['funder'].fillna('Government Of Tanzania')
  218. data_x['installer'] = data_x['installer'].fillna('DWE')
  219. data_x['public_meeting'] = data_x['public_meeting'].fillna(True)
  220. data_x['scheme_management'] = data_x['scheme_management'].fillna('VWC')
  221. data_x['permit'] = data_x['permit'].fillna(True)
  222. data_x['subvillage'] = data_x['subvillage'].fillna('Unknown')
  223. data_x['wpt_name'] = data_x['wpt_name'].fillna('none')
  224. data_x.ix[data_x['latitude']>-0.1,'latitude']=None
  225. data_x.ix[data_x['longitude']==0,'longitude']=None
  226. data_x["longitude"] = data_x.groupby("region_code").transform(lambda x: x.fillna(x.median())).longitude
  227. data_x["latitude"] = data_x.groupby("region_code").transform(lambda x: x.fillna(x.median())).latitude
  228. data_x.construction_year=pd.to_numeric(data_x.construction_year)
  229. data_x.loc[data_x.construction_year <= 0, data_x.columns=='construction_year'] = 1950
  230. # mean() tarda mucho, pero mejora un poco los resultados con respecto a median()
  231. #data_x=data_x.fillna(data_x.mean())
  232. #data_x = data_x.fillna(data_x.median())
  233. ''' ------------------ RARE VALUES ------------------ '''
  234. print(" Etiquetando casos raros...")
  235. columns_other = [x for x in data_x.columns if x not in ['latitude','longitude','gps_height','age','population','construction_year','month_recorder']]
  236. for col in columns_other:
  237. value_counts = data_x[col].value_counts()
  238. lessthen = value_counts[value_counts < 20]
  239. listnow = data_x.installer.isin(list(lessthen.keys()))
  240. data_x.loc[listnow,col] = 'Others'
  241. ''' ------------------ CARTESIAN ------------------ '''
  242. print(" Preprocesando coordenadas y distancias...")
  243. data_x['dist'] = data_x.apply(lambda row: distancia(row['longitude'], row['latitude'], 0, 0), axis=1)
  244. data_x['cartesian_x'] = data_x.apply(lambda row: cartesian_x(row['longitude'], row['latitude']), axis=1)
  245. data_x['cartesian_y'] = data_x.apply(lambda row: cartesian_y(row['longitude'], row['latitude']), axis=1)
  246. data_x.drop(labels=['longitude', 'latitude'], axis=1, inplace = True)
  247. ''' ------------------ DATES ------------------ '''
  248. print(" Preprocesando fechas...")
  249. data_x = date_parser(data_x)
  250. data_x.population = data_x.population.apply(lambda x: np.log10(x+1))
  251. print(" Convirtiendo categóricas a numéricas...")
  252. data_x = data_x.astype(str).apply(LabelEncoder().fit_transform)
  253. data_x_tst = data_x[len(data_x_orig):]
  254. data_x = data_x[:len(data_x_orig)]
  255. X = data_x.values
  256. y = np.ravel(data_y.values)
  257. #y = le.fit(y).transform(y)
  258. X_tst = data_x_tst.values
  259. print("Datos preprocesados con éxito.\n")
  260. ''' -------------------- CROSS VALIDATION -------------------- '''
  261. '''
  262. print("Validación cruzada:\n")
  263. print('\nKNN\n')
  264. knn = KNeighborsClassifier(n_neighbors=5)
  265. cross_validation(clf=knn, X = X, y = y, cv = None, min_max_scaler = True)
  266. print('\nXGB\n')
  267. clf = xgb.XGBClassifier(n_estimators = 200)
  268. cross_validation(clf, X, y)
  269. print('\nLGB\n')
  270. clf = lgb.LGBMClassifier(objective='binary', n_estimators=200, num_leaves=31)
  271. cross_validation(clf, X, y)
  272. print('\nRandomForest\n')
  273. clf = RandomForestClassifier(n_estimators=125, max_depth = 20, random_state = 10)
  274. cross_validation(clf, X, y)
  275. print('\nExtraTreesClassifier\n')
  276. clf = ExtraTreesClassifier(n_estimators = 125, max_depth = 20)
  277. cross_validation(clf, X, y)
  278. '''
  279. ''' -------------------- SUBMISSION 1 -------------------- '''
  280. '''
  281. clf = xgb.XGBClassifier(n_estimators = 200)
  282. clf = clf.fit(X,y)
  283. y_pred_tst = clf.predict(X_tst)
  284. df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
  285. df_submission['status_group'] = y_pred_tst
  286. df_submission.to_csv("submission1.csv", index=False)
  287. '''
  288. ''' ---------------------------------------------------- '''
  289. ''' -------------------- SUBMISSION 2 -------------------- '''
  290. '''
  291. clf = RandomForestClassifier(n_estimators = 125)
  292. clf = clf.fit(X,y)
  293. y_pred_tst = clf.predict(X_tst)
  294. df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
  295. df_submission['status_group'] = y_pred_tst
  296. df_submission.to_csv("submission2.csv", index=False)
  297. '''
  298. ''' ---------------------------------------------------- '''
  299. ''' -------------------- SUBMISSION 3 -------------------- '''
  300. '''
  301. clf = RandomForestClassifier()
  302. clf = clf.fit(X,y)
  303. y_pred_tst = clf.predict(X_tst)
  304. df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
  305. df_submission['status_group'] = y_pred_tst
  306. df_submission.to_csv("submission3.csv", index=False)
  307. '''
  308. ''' ---------------------------------------------------- '''
  309. ''' -------------------- SUBMISSION 6 -------------------- '''
  310. '''
  311. # Eliminated features:
  312. # 'num_private', 'recorded_by', 'region', 'scheme_name', 'scheme_management'
  313. clf = RandomForestClassifier(max_features = 'sqrt', n_estimators = 500, random_state=10)
  314. clf = clf.fit(X,y)
  315. y_pred_tst = clf.predict(X_tst)
  316. df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
  317. df_submission['status_group'] = y_pred_tst
  318. df_submission.to_csv("submission6.csv", index=False)
  319. '''
  320. ''' ---------------------------------------------------- '''
  321. ''' -------------------- SUBMISSION 8 -------------------- '''
  322. '''
  323. print("Submission 8")
  324. clf = RandomForestClassifier(max_features = 'sqrt', n_estimators = 200, max_depth = 20)
  325. clf = clf.fit(X,y)
  326. y_pred_tst = clf.predict(X_tst)
  327. df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
  328. df_submission['status_group'] = y_pred_tst
  329. df_submission.to_csv("submission9.csv", index=False)
  330. '''
  331. ''' ---------------------------------------------------- '''
  332. ''' -------------------- SUBMISSION 11 -------------------- '''
  333. '''
  334. print("Submission 11")
  335. clf = RandomForestClassifier(n_estimators=200, max_depth = 20, random_state = 10)
  336. clf = clf.fit(X,y)
  337. y_pred_tst = clf.predict(X_tst)
  338. df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
  339. df_submission['status_group'] = y_pred_tst
  340. df_submission.to_csv("submission11.csv", index=False)
  341. '''
  342. ''' -------------------- SUBMISSION 12 -------------------- '''
  343. '''
  344. print("Submission 12")
  345. clf = RandomForestClassifier(n_estimators=125, max_depth = 20)
  346. clf = clf.fit(X,y)
  347. y_pred_tst = clf.predict(X_tst)
  348. df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
  349. df_submission['status_group'] = y_pred_tst
  350. df_submission.to_csv("submission12.csv", index=False)
  351. '''
  352. ''' -------------------- SUBMISSION 13 -------------------- '''
  353. '''
  354. print("Submission 13")
  355. fit_rf = RandomForestClassifier(max_features = 'sqrt', max_depth=20)
  356. estimators = range(25,201,25)
  357. param_dist = {'n_estimators': estimators}
  358. clf= GridSearchCV(fit_rf, cv = 5, scoring = 'accuracy', param_grid=param_dist, n_jobs = 3)
  359. clf = clf.fit(X,y)
  360. y_pred_tst = clf.predict(X_tst)
  361. df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
  362. df_submission['status_group'] = y_pred_tst
  363. df_submission.to_csv("submission13.csv", index=False)
  364. '''
  365. ''' -------------------- SUBMISSION 15 -------------------- '''
  366. '''
  367. print("Submission 15")
  368. clf = RandomForestClassifier(n_estimators=125, max_depth = 22)
  369. clf = clf.fit(X,y)
  370. y_pred_tst = clf.predict(X_tst)
  371. df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
  372. df_submission['status_group'] = y_pred_tst
  373. df_submission.to_csv("submission15.csv", index=False)
  374. '''
  375. ''' -------------------- SUBMISSION 16 -------------------- '''
  376. '''
  377. print("Submission 16")
  378. clf = RandomForestClassifier(n_estimators=500)
  379. clf = clf.fit(X,y)
  380. y_pred_tst = clf.predict(X_tst)
  381. df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
  382. df_submission['status_group'] = y_pred_tst
  383. df_submission.to_csv("submission16.csv", index=False)
  384. # Nota: este experimento empeora los resultados, posible sobreentrenamiento
  385. '''
  386. ''' -------------------- SUBMISSION 17 -------------------- '''
  387. '''
  388. print("Submission 17")
  389. clf = RandomForestClassifier(n_estimators=120, max_depth = 20)
  390. clf = clf.fit(X,y)
  391. y_pred_tst = clf.predict(X_tst)
  392. df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
  393. df_submission['status_group'] = y_pred_tst
  394. df_submission.to_csv("submission17.csv", index=False)
  395. '''
  396. ''' -------------------- SUBMISSION 18 -------------------- '''
  397. '''
  398. # fillnan() with more repeated
  399. print("Submission 18")
  400. clf = RandomForestClassifier(n_estimators=160, max_depth = 20)
  401. clf = clf.fit(X,y)
  402. y_pred_tst = clf.predict(X_tst)
  403. df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
  404. df_submission['status_group'] = y_pred_tst
  405. df_submission.to_csv("submission18.csv", index=False)
  406. '''
  407. ''' -------------------- SUBMISSION 19 -------------------- '''
  408. '''
  409. # fillnan() with more repeated
  410. print("Submission 19")
  411. clf = RandomForestClassifier(n_estimators=150, max_depth = 20)
  412. clf = clf.fit(X,y)
  413. y_pred_tst = clf.predict(X_tst)
  414. df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
  415. df_submission['status_group'] = y_pred_tst
  416. df_submission.to_csv("submission19.csv", index=False)
  417. '''
  418. ''' -------------------- SUBMISSION 22 -------------------- '''
  419. '''
  420. print("Submission 22")
  421. fit_rf = RandomForestClassifier(max_features = 'sqrt', max_depth=20)
  422. estimators = range(25,201,25)
  423. param_dist = {'n_estimators': estimators}
  424. clf= GridSearchCV(fit_rf, cv = 5, scoring = 'accuracy', param_grid=param_dist, n_jobs = 3)
  425. clf = clf.fit(X,y)
  426. y_pred_tst = clf.predict(X_tst)
  427. df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
  428. df_submission['status_group'] = y_pred_tst
  429. df_submission.to_csv("submission22.csv", index=False)
  430. best_param = clf.best_params_['n_estimators']
  431. print ("Mejor valor para n_estimators: ", best_param)
  432. '''
  433. ''' -------------------- SUBMISSION 23 -------------------- '''
  434. '''
  435. print("Submission 23")
  436. fit_rf = RandomForestClassifier(max_features = 'sqrt', max_depth=25)
  437. estimators = range(100,1101,25)
  438. param_dist = {'n_estimators': estimators}
  439. clf= GridSearchCV(fit_rf, cv = 5, scoring = 'accuracy', param_grid=param_dist, n_jobs = 3)
  440. clf = clf.fit(X,y)
  441. y_pred_tst = clf.predict(X_tst)
  442. df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
  443. df_submission['status_group'] = y_pred_tst
  444. df_submission.to_csv("submission23.csv", index=False)
  445. best_param = clf.best_params_['n_estimators']
  446. print ("Mejor valor para n_estimators: ", best_param)
  447. '''
  448. ''' -------------------- SUBMISSION 24 -------------------- '''
  449. '''
  450. print("Submission 24")
  451. clf = RandomForestClassifier(n_estimators=100, max_depth = 20)
  452. clf = clf.fit(X,y)
  453. y_pred_tst = clf.predict(X_tst)
  454. df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
  455. df_submission['status_group'] = y_pred_tst
  456. df_submission.to_csv("submission24.csv", index=False)
  457. '''
  458. ''' -------------------- SUBMISSION 25 -------------------- '''
  459. '''
  460. print("Submission 25")
  461. clf = RandomForestClassifier(n_estimators=150, max_depth = 20)
  462. clf = clf.fit(X,y)
  463. y_pred_tst = clf.predict(X_tst)
  464. df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
  465. df_submission['status_group'] = y_pred_tst
  466. df_submission.to_csv("submission25.csv", index=False)
  467. '''
  468. ''' ------------------- FINAL SUBMISSION ------------------ '''
  469. ''' -------------------- SUBMISSION 26 -------------------- '''
  470. print("Submission 26")
  471. clf = RandomForestClassifier(n_estimators = 125, max_depth = 20)
  472. clf = clf.fit(X,y)
  473. y_pred_tst = clf.predict(X_tst)
  474. df_submission = pd.read_csv('data/water_pump_submissionformat.csv')
  475. df_submission['status_group'] = y_pred_tst
  476. df_submission.to_csv("submission26.csv", index=False)