# -*- coding: utf-8 -*-
# This code is initially based on the Kaggle kernel from Sergei Neviadomski, which can be found in the following link
# https://www.kaggle.com/neviadomski/how-to-get-to-top-25-with-simple-model-sklearn/notebook
# and the Kaggle kernel from Pedro Marcelino, which can be found in the link below
# https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python/notebook
# Also, part of the preprocessing has been inspired by this kernel from Serigne
# https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
# Adding needed libraries and reading data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble, tree, linear_model, preprocessing
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.kernel_ridge import KernelRidge
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle
from scipy import stats
from scipy.stats import norm, skew, boxcox
from scipy.special import boxcox1p
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
# Class AveragingModels
# This class is Serigne's simplest way of stacking the prediction models:
# averaging them. We are going to use it because it matches what we have been
# doing in recent submissions, and it plugs straight into the rmsle_cv function.
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models

    # We define clones of the original models to fit the data on
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)
        return self

    # Now we make the predictions with the cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)
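# Illustrative sanity check (not part of the pipeline): averaging two constant
# predictors should yield their mean. DummyRegressor is only a stand-in to
# exercise the class; the real base models are defined further below.
from sklearn.dummy import DummyRegressor
_demo = AveragingModels(models=(DummyRegressor(strategy='constant', constant=1.0),
                                DummyRegressor(strategy='constant', constant=3.0)))
_demo.fit(np.zeros((4, 1)), np.zeros(4))
assert np.allclose(_demo.predict(np.zeros((2, 1))), 2.0)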
train = pd.read_csv("../../train.csv")
test = pd.read_csv("../../test.csv")
# Save the 'Id' column
train_ID = train['Id']
test_ID = test['Id']
train.drop('Id', axis=1, inplace=True)
test.drop('Id', axis=1, inplace=True)
# Visualizing outliers
fig, ax = plt.subplots()
ax.scatter(x=train['GrLivArea'], y=train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
#plt.show()
# Now the outliers can be deleted
train = train.drop(train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)].index)
# Check the graphic again, making sure there are no outliers left
fig, ax = plt.subplots()
ax.scatter(train['GrLivArea'], train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
#plt.show()
# We use the numpy function log1p, which applies log(1+x) to all elements of the column
train["SalePrice"] = np.log1p(train["SalePrice"])
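# Quick sanity check (illustrative): expm1 inverts log1p up to float precision,
# which is why predictions are mapped back with expm1 at the end of the script.
assert np.isclose(np.expm1(np.log1p(200000.0)), 200000.0)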
# Check the new distribution
sns.distplot(train['SalePrice'], fit=norm)
# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Now plot the distribution
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')
# Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
#plt.show()
# Splitting into features and labels
train_labels = train.pop('SalePrice')
# The test set has no 'SalePrice' column, so both sets can be concatenated
features = pd.concat([train, test], keys=['train', 'test'])
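# Illustrative: the keys argument builds a two-level index, so the combined
# frame can be split back apart later via features.loc['train'] / .loc['test'].
_a = pd.DataFrame({'x': [1]})
_b = pd.DataFrame({'x': [2]})
assert pd.concat([_a, _b], keys=['train', 'test']).loc['train']['x'].tolist() == [1]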
# Checking for missing data, showing every variable with at least one missing value in the train set
total_missing_data = features.isnull().sum().sort_values(ascending=False)
missing_data_percent = (features.isnull().sum()/features.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total_missing_data, missing_data_percent], axis=1, keys=['Total', 'Percent'])
print(missing_data[missing_data['Percent'] > 0])
# Dropping variables that are not interesting for this case study
features.drop(['Utilities'], axis=1, inplace=True)
# Imputing missing values and transforming certain columns
# Converting OverallCond to str
features.OverallCond = features.OverallCond.astype(str)
# MSSubClass as str
features['MSSubClass'] = features['MSSubClass'].astype(str)
# MSZoning NA in pred. Filling with the most popular value
features['MSZoning'] = features['MSZoning'].fillna(features['MSZoning'].mode()[0])
# LotFrontage NA filled with the median according to its OverallQual value
median = features.groupby('OverallQual')['LotFrontage'].transform('median')
features['LotFrontage'] = features['LotFrontage'].fillna(median)
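# Illustrative check of the groupby-transform pattern used above: transform
# broadcasts the per-group median back onto every row of the original frame,
# so fillna can align it element by element.
_toy = pd.DataFrame({'g': ['a', 'a', 'b'], 'v': [1.0, np.nan, 5.0]})
_toy['v'] = _toy['v'].fillna(_toy.groupby('g')['v'].transform('median'))
assert _toy['v'].tolist() == [1.0, 1.0, 5.0]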
# Alley NA in all. NA means no access
features['Alley'] = features['Alley'].fillna('NOACCESS')
# MasVnrArea NA filled with the median according to its OverallQual value
median = features.groupby('OverallQual')['MasVnrArea'].transform('median')
features['MasVnrArea'] = features['MasVnrArea'].fillna(median)
# MasVnrType NA in all. Filling with the most popular value
features['MasVnrType'] = features['MasVnrType'].fillna(features['MasVnrType'].mode()[0])
# BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2
# NA in all. NA means no basement
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    features[col] = features[col].fillna('NoBSMT')
# TotalBsmtSF NA in pred. I suppose NA means 0
features['TotalBsmtSF'] = features['TotalBsmtSF'].fillna(0)
# Electrical NA in pred. Filling with the most popular value
features['Electrical'] = features['Electrical'].fillna(features['Electrical'].mode()[0])
# KitchenAbvGr to categorical
features['KitchenAbvGr'] = features['KitchenAbvGr'].astype(str)
# KitchenQual NA in pred. Filling with the most popular value
features['KitchenQual'] = features['KitchenQual'].fillna(features['KitchenQual'].mode()[0])
# FireplaceQu NA in all. NA means no fireplace
features['FireplaceQu'] = features['FireplaceQu'].fillna('NoFP')
# Garage-like features NA in all. NA means no garage
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageYrBlt', 'GarageCond'):
    features[col] = features[col].fillna('NoGRG')
# GarageCars and GarageArea NA in pred. I suppose NA means 0
for col in ('GarageCars', 'GarageArea'):
    features[col] = features[col].fillna(0.0)
# SaleType NA in pred. Filling with the most popular value
features['SaleType'] = features['SaleType'].fillna(features['SaleType'].mode()[0])
# PoolQC NA in all. NA means no pool
features['PoolQC'] = features['PoolQC'].fillna('NoPool')
# MiscFeature NA in all. NA means none
features['MiscFeature'] = features['MiscFeature'].fillna('None')
# Fence NA in all. NA means no fence
features['Fence'] = features['Fence'].fillna('NoFence')
# BsmtHalfBath and BsmtFullBath NA means 0
for col in ('BsmtHalfBath', 'BsmtFullBath'):
    features[col] = features[col].fillna(0)
# Functional NA means Typ
features['Functional'] = features['Functional'].fillna('Typ')
# NA in the Bsmt SF variables means not that type of basement: 0 square feet
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF'):
    features[col] = features[col].fillna(0)
# NA in Exterior1st and Exterior2nd filled with the most common value
for col in ('Exterior1st', 'Exterior2nd'):
    features[col] = features[col].fillna(features[col].mode()[0])
# Year and month to categorical
features['YrSold'] = features['YrSold'].astype(str)
features['MoSold'] = features['MoSold'].astype(str)
# Adding a total square footage feature and removing basement, 1st and 2nd floor features
features['TotalSF'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
#features.drop(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], axis=1, inplace=True)
# Box-Cox transformation of the most skewed features
numeric_feats = features.dtypes[features.dtypes != "object"].index
# Check the skew of all numerical features
skewed_feats = features[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features:")
skewness = pd.DataFrame({'Skew': skewed_feats})
print(skewness.head(10))
# Box-Cox
# Filter on the 'Skew' column so that rows are actually dropped (masking the
# whole frame would only replace non-matching values with NaN and keep every feature)
skewness = skewness[abs(skewness['Skew']) > 0.75]
print("There are {} skewed numerical features to Box Cox transform\n".format(skewness.shape[0]))
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    features[feat] = boxcox1p(features[feat], lam)
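# Sanity check (illustrative): Box-Cox with lambda = 0 reduces to log1p, so
# lam = 0.15 is only a mild variation on the log transform used for SalePrice.
assert np.isclose(boxcox1p(9.0, 0.0), np.log1p(9.0))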
# Label encoding for some categorical features
categorical_features = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
                        'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
                        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
                        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
                        'YrSold', 'MoSold')
lbl = LabelEncoder()
for col in categorical_features:
    lbl.fit(list(features[col].values))
    features[col] = lbl.transform(list(features[col].values))
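# Illustrative: LabelEncoder maps the sorted unique labels to 0..n-1, e.g.
# fitting on ['TA', 'Gd', 'TA'] gives Gd -> 0 and TA -> 1 (alphabetical order),
# which imposes an arbitrary rather than quality-based ordering.
assert list(LabelEncoder().fit_transform(['TA', 'Gd', 'TA'])) == [1, 0, 1]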
# Getting dummies
features = pd.get_dummies(features)
# Splitting features
train_features = features.loc['train'].select_dtypes(include=[np.number]).values
test_features = features.loc['test'].select_dtypes(include=[np.number]).values
# Validation function
n_folds = 5
def rmsle_cv(model):
    # Keep the KFold object itself: calling get_n_splits here would reduce it
    # to a plain integer, silently discarding the shuffle and random_state
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=101010)
    rmse = np.sqrt(-cross_val_score(model, train_features, train_labels,
                                    scoring="neg_mean_squared_error", cv=kf))
    return rmse
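# Since SalePrice was log1p-transformed earlier, the RMSE that rmsle_cv computes
# on the transformed target is effectively the RMSLE of the raw prices. Example
# usage (matching the commented-out checks below):
#   score = rmsle_cv(lasso_model)
#   print(score.mean(), score.std())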
# Modelling
enet_model = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=101010))
#print(enet_model)
#score = rmsle_cv(enet_model)
#print("\nRMSLE: {:.4f} (+/- {:.4f})\n".format(score.mean(), score.std()))
gb_model = ensemble.GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                              max_depth=4, max_features='sqrt',
                                              min_samples_leaf=15, min_samples_split=10,
                                              loss='huber', random_state=101010)
#print(gb_model)
#score = rmsle_cv(gb_model)
#print("\nRMSLE: {:.4f} (+/- {:.4f})\n".format(score.mean(), score.std()))
lasso_model = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=101010))
#print(lasso_model)
#score = rmsle_cv(lasso_model)
#print("\nRMSLE: {:.4f} (+/- {:.4f})\n".format(score.mean(), score.std()))
krr_model = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
#print(krr_model)
#score = rmsle_cv(krr_model)
#print("\nRMSLE: {:.4f} (+/- {:.4f})\n".format(score.mean(), score.std()))
# Now let's check how the averaged models perform
averaged_models = AveragingModels(models=(gb_model, enet_model, lasso_model, krr_model))
print("AVERAGED MODELS")
score = rmsle_cv(averaged_models)
print("\nRMSLE: {:.4f} (+/- {:.4f})\n".format(score.mean(), score.std()))
# Getting our SalePrice estimation
averaged_models.fit(train_features, train_labels)
# Use expm1, the exact inverse of the log1p applied to SalePrice above
final_labels = np.expm1(averaged_models.predict(test_features))
# Saving to CSV
pd.DataFrame({'Id': test_ID, 'SalePrice': final_labels}).to_csv('submission-9.csv', index=False)