# -*- coding: utf-8 -*-
# This code is initially based on the Kaggle kernel from Sergei Neviadomski, which can be found in the following link
# https://www.kaggle.com/neviadomski/how-to-get-to-top-25-with-simple-model-sklearn/notebook
# and the Kaggle kernel from Pedro Marcelino, which can be found in the link below
# https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python/notebook

# Adding needed libraries and reading data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble, tree, linear_model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.utils import shuffle
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

train = pd.read_csv("../../train.csv")
test = pd.read_csv("../../test.csv")
train.head()

# Checking for missing data, showing every variable with at least one missing value in train set
total_missing_data = train.isnull().sum().sort_values(ascending=False)
missing_data_percent = (train.isnull().sum()/train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total_missing_data, missing_data_percent], axis=1, keys=['Total', 'Percent'])
print(missing_data[missing_data['Percent'] > 0])
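# (Optional) visualize the same missing-value summary as a bar chart; a minimal
# sketch using the pandas/matplotlib imports above, not part of the original pipeline.
missing_data[missing_data['Percent'] > 0]['Percent'].plot.bar(figsize=(10, 4))
#plt.show()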
'''
# Let's get rid of the missing data
train = train.drop((missing_data[missing_data['Total'] > 0]).index, 1)
'''
# Prints R2 and RMSE scores
def get_score(prediction, labels):
    # Note: sklearn metrics expect (y_true, y_pred), so labels go first
    print('R2: {}'.format(r2_score(labels, prediction)))
    print('RMSE: {}'.format(np.sqrt(mean_squared_error(labels, prediction))))
    print('RMSLE: {}'.format(np.sqrt(np.square(np.log(prediction + 1) - np.log(labels + 1)).mean())))

# Shows scores for train and validation sets
def train_test(estimator, x_trn, x_tst, y_trn, y_tst):
    prediction_train = estimator.predict(x_trn)
    # Printing estimator
    print(estimator)
    # Printing train scores
    get_score(prediction_train, y_trn)
    prediction_test = estimator.predict(x_tst)
    # Printing validation (hold-out) scores
    print("Test")
    get_score(prediction_test, y_tst)
# Splitting into features and labels and deleting variables I don't need
train_labels = train.pop('SalePrice')

# Test set does not even have a 'SalePrice' column, so both sets can be concatenated
features = pd.concat([train, test], keys=['train', 'test'])

# I decided to get rid of features that are missing more than half of their values or that do not correlate with SalePrice
features.drop(['Utilities', 'RoofMatl', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'Heating', 'LowQualFinSF',
               'BsmtFullBath', 'BsmtHalfBath', 'Functional', 'GarageYrBlt', 'GarageArea', 'GarageCond', 'WoodDeckSF',
               'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal'],
              axis=1, inplace=True)
# MSSubClass as str
features['MSSubClass'] = features['MSSubClass'].astype(str)

# MSZoning NA in pred. filling with most popular values
features['MSZoning'] = features['MSZoning'].fillna(features['MSZoning'].mode()[0])
# LotFrontage NA in all. Filling with the mean value
features['LotFrontage'] = features['LotFrontage'].fillna(features['LotFrontage'].mean())
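# An alternative often used for LotFrontage (sketch only, not applied here): impute
# with the median frontage of each Neighborhood instead of the global mean. This
# assumes the Neighborhood column is present, which it is since it was not dropped above.
# features['LotFrontage'] = features.groupby('Neighborhood')['LotFrontage'].transform(
#     lambda x: x.fillna(x.median()))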
# Alley NA in all. NA means no access
features['Alley'] = features['Alley'].fillna('NOACCESS')

# Converting OverallCond to str
features.OverallCond = features.OverallCond.astype(str)

# MasVnrType NA in all. filling with most popular values
features['MasVnrType'] = features['MasVnrType'].fillna(features['MasVnrType'].mode()[0])

# BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2
# NA in all. NA means No basement
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    features[col] = features[col].fillna('NoBSMT')

# TotalBsmtSF NA in pred. I suppose NA means 0
features['TotalBsmtSF'] = features['TotalBsmtSF'].fillna(0)

# Electrical NA in pred. filling with most popular values
features['Electrical'] = features['Electrical'].fillna(features['Electrical'].mode()[0])

# KitchenAbvGr to categorical
features['KitchenAbvGr'] = features['KitchenAbvGr'].astype(str)

# KitchenQual NA in pred. filling with most popular values
features['KitchenQual'] = features['KitchenQual'].fillna(features['KitchenQual'].mode()[0])

# FireplaceQu NA in all. NA means No Fireplace
features['FireplaceQu'] = features['FireplaceQu'].fillna('NoFP')

# GarageType, GarageFinish, GarageQual NA in all. NA means No Garage
for col in ('GarageType', 'GarageFinish', 'GarageQual'):
    features[col] = features[col].fillna('NoGRG')

# GarageCars NA in pred. I suppose NA means 0
features['GarageCars'] = features['GarageCars'].fillna(0.0)

# SaleType NA in pred. filling with most popular values
features['SaleType'] = features['SaleType'].fillna(features['SaleType'].mode()[0])

# Year and Month to categorical
features['YrSold'] = features['YrSold'].astype(str)
features['MoSold'] = features['MoSold'].astype(str)

# Adding total sqfootage feature and removing Basement, 1st and 2nd floor features
features['TotalSF'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
features.drop(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], axis=1, inplace=True)
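# Quick sanity check (illustrative, not in the original kernels): the new TotalSF
# feature should correlate strongly with SalePrice on the train part of `features`.
print('Correlation of TotalSF with SalePrice: {:.3f}'.format(
    features.loc['train', 'TotalSF'].corr(train_labels)))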
# Our SalePrice is skewed right (check the plot below), so I'm log-transforming it
plt.figure(1)
plt.clf()
ax = sns.distplot(train_labels)
#plt.show()

## Log transformation of labels
train_labels = np.log(train_labels)
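# Optional check (not in the original kernels): quantify the effect of the log
# transform with scipy's skew; SalePrice is strongly right-skewed before and close
# to symmetric after. scipy is assumed to be available, as sklearn depends on it.
from scipy.stats import skew
print('Skewness of log(SalePrice): {:.3f}'.format(skew(train_labels)))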
## Now it looks much better
plt.figure(2)
plt.clf()
ax = sns.distplot(train_labels)
#plt.show()
## Standardizing numeric features
numeric_features = features.loc[:, ['LotFrontage', 'LotArea', 'GrLivArea', 'TotalSF']]
numeric_features_standardized = (numeric_features - numeric_features.mean()) / numeric_features.std()
#ax = sns.pairplot(numeric_features_standardized)
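# Equivalent standardization via scikit-learn (sketch only, not used below). Note that
# StandardScaler divides by the population std (ddof=0) while pandas .std() uses ddof=1,
# so the two results differ very slightly.
# from sklearn.preprocessing import StandardScaler
# scaled = pd.DataFrame(StandardScaler().fit_transform(numeric_features),
#                       index=numeric_features.index, columns=numeric_features.columns)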
# Getting Dummies from Condition1 and Condition2
conditions = list(set([x for x in features['Condition1']] + [x for x in features['Condition2']]))
dummies = pd.DataFrame(data=np.zeros((len(features.index), len(conditions))),
                       index=features.index, columns=conditions)
for i, cond in enumerate(zip(features['Condition1'], features['Condition2'])):
    # .ix has been removed from pandas; use positional rows with label-based columns instead
    dummies.iloc[i, dummies.columns.get_indexer(list(cond))] = 1
features = pd.concat([features, dummies.add_prefix('Condition_')], axis=1)
features.drop(['Condition1', 'Condition2'], axis=1, inplace=True)
# Getting Dummies from Exterior1st and Exterior2nd
exteriors = list(set([x for x in features['Exterior1st']] + [x for x in features['Exterior2nd']]))
dummies = pd.DataFrame(data=np.zeros((len(features.index), len(exteriors))),
                       index=features.index, columns=exteriors)
for i, ext in enumerate(zip(features['Exterior1st'], features['Exterior2nd'])):
    dummies.iloc[i, dummies.columns.get_indexer(list(ext))] = 1
features = pd.concat([features, dummies.add_prefix('Exterior_')], axis=1)
features.drop(['Exterior1st', 'Exterior2nd', 'Exterior_nan'], axis=1, inplace=True)
# Getting Dummies from all other categorical vars
for col in features.dtypes[features.dtypes == 'object'].index:
    for_dummy = features.pop(col)
    features = pd.concat([features, pd.get_dummies(for_dummy, prefix=col)], axis=1)

### Copying features
features_standardized = features.copy()

### Replacing numeric features by standardized values
features_standardized.update(numeric_features_standardized)
### Splitting features
train_features = features.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
test_features = features.loc['test'].drop('Id', axis=1).select_dtypes(include=[np.number]).values

### Splitting standardized features
train_features_st = features_standardized.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
test_features_st = features_standardized.loc['test'].drop('Id', axis=1).select_dtypes(include=[np.number]).values

### Shuffling train sets
train_features_st, train_features, train_labels = shuffle(train_features_st, train_features, train_labels, random_state=5)

### Splitting into train and validation parts
x_train, x_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.1, random_state=200)
x_train_st, x_test_st, y_train_st, y_test_st = train_test_split(train_features_st, train_labels, test_size=0.1, random_state=200)
'''
Elastic Net
'''
ENSTest = linear_model.ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10], l1_ratio=[.01, .1, .5, .9, .99],
                                    max_iter=5000).fit(x_train_st, y_train_st)
train_test(ENSTest, x_train_st, x_test_st, y_train_st, y_test_st)

# Average R2 score and standard deviation of 5-fold cross-validation
scores = cross_val_score(ENSTest, train_features_st, train_labels, cv=5)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
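# The hyperparameters picked by cross-validation can be inspected on the fitted model;
# alpha_ and l1_ratio_ are standard attributes of sklearn's ElasticNetCV.
print('Chosen alpha: {}, l1_ratio: {}'.format(ENSTest.alpha_, ENSTest.l1_ratio_))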
'''
Gradient Boosting
'''
GBest = ensemble.GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=3, max_features='sqrt',
                                           min_samples_leaf=15, min_samples_split=10, loss='huber').fit(x_train, y_train)
train_test(GBest, x_train, x_test, y_train, y_test)

# Average R2 score and standard deviation of 5-fold cross-validation
scores = cross_val_score(GBest, train_features_st, train_labels, cv=5)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
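# Illustrative only: map GBest.feature_importances_ back to column names by rebuilding
# the same column list that train_features was built from above.
feature_names = features.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).columns
importances = pd.Series(GBest.feature_importances_, index=feature_names).sort_values(ascending=False)
print(importances.head(10))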
'''
XGBoost
'''
XGBest = xgb.XGBRegressor(max_depth=3, learning_rate=0.05, n_estimators=3000).fit(x_train, y_train)
train_test(XGBest, x_train, x_test, y_train, y_test)

# Average R2 score and standard deviation of 5-fold cross-validation
scores = cross_val_score(XGBest, train_features_st, train_labels, cv=5)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
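# A common variant (sketch only, not used for the submission): early stopping on the
# hold-out split instead of a fixed 3000 rounds. The exact API depends on the xgboost
# version; in recent releases early_stopping_rounds is a constructor argument, in
# older ones it is passed to fit().
# XGBest_es = xgb.XGBRegressor(max_depth=3, learning_rate=0.05, n_estimators=3000,
#                              early_stopping_rounds=50)
# XGBest_es.fit(x_train, y_train, eval_set=[(x_test, y_test)], verbose=False)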
# Retraining models on the full train set
GB_model = GBest.fit(train_features, train_labels)
ENST_model = ENSTest.fit(train_features_st, train_labels)
XGB_model = XGBest.fit(train_features, train_labels)

## Getting our SalePrice estimation by averaging the back-transformed (exp) predictions
Final_labels = (np.exp(GB_model.predict(test_features)) + np.exp(ENST_model.predict(test_features_st))
                + np.exp(XGB_model.predict(test_features))) / 3
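# The submission below uses a simple average of the three back-transformed predictions.
# A weighted blend is a common variation; the weights here are purely illustrative.
# Final_labels = (0.4 * np.exp(GB_model.predict(test_features))
#                 + 0.3 * np.exp(ENST_model.predict(test_features_st))
#                 + 0.3 * np.exp(XGB_model.predict(test_features)))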
## Saving to CSV
pd.DataFrame({'Id': test.Id, 'SalePrice': Final_labels}).to_csv('submission-2.csv', index=False)