kaggle-12.py

# -*- coding: utf-8 -*-
# This code is initially based on the Kaggle kernel from Sergei Neviadomski, which can be found in the following link
# https://www.kaggle.com/neviadomski/how-to-get-to-top-25-with-simple-model-sklearn/notebook
# and the Kaggle kernel from Pedro Marcelino, which can be found in the link below
# https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python/notebook
# Also, part of the preprocessing and modelling has been inspired by this kernel from Serigne
# https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
# And this kernel from juliencs has been pretty helpful too!
# https://www.kaggle.com/juliencs/a-study-on-regression-applied-to-the-ames-dataset

# Adding needed libraries and reading data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble, tree, linear_model, preprocessing
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.kernel_ridge import KernelRidge
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle
from scipy import stats
from scipy.stats import norm, skew, boxcox
from scipy.special import boxcox1p
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
# Class AveragingModels
# This class is Serigne's simplest way of stacking the prediction models: just
# averaging their outputs. We use it because it mirrors what we have been doing
# in the latest submissions, and it plugs straight into the rmsle_cv function.
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models

    # We fit clones of the original models so the passed-in instances stay untouched
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)
        return self

    # Now we get the predictions of the cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)
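# A minimal usage sketch (hypothetical X, y arrays; any sklearn-style regressors
# would do), kept commented out since the class is exercised for real further down:
#   avg = AveragingModels(models=(Lasso(alpha=0.0005), KernelRidge(alpha=0.6)))
#   avg.fit(X, y)
#   preds = avg.predict(X)  # row-wise mean of the base models' predictions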
train = pd.read_csv("../../train.csv")
test = pd.read_csv("../../test.csv")

# Save the 'Id' column
train_ID = train['Id']
test_ID = test['Id']
train.drop('Id', axis=1, inplace=True)
test.drop('Id', axis=1, inplace=True)
# Visualizing outliers
fig, ax = plt.subplots()
ax.scatter(x=train['GrLivArea'], y=train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
#plt.show()
# Now the outliers can be deleted
train = train.drop(train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)].index)
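# The points dropped here (huge living area but unusually low price) are the
# well-known Ames outliers; left in, they would distort any linear fit.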
# Check the graphic again, making sure there are no outliers left
fig, ax = plt.subplots()
ax.scatter(train['GrLivArea'], train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
#plt.show()
# We use the numpy function log1p, which applies log(1+x) to all elements of the column
train["SalePrice"] = np.log1p(train["SalePrice"])
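# Note: the inverse of log1p is np.expm1, which is what has to be applied to the
# model output at prediction time to get back to the original SalePrice scale.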
# Check the new distribution
sns.distplot(train['SalePrice'], fit=norm)

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Now plot the distribution
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Also get the QQ-plot
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
#plt.show()
# Splitting into features and labels
train_labels = train.pop('SalePrice')
# The test set does not even have a 'SalePrice' column, so both sets can be concatenated
features = pd.concat([train, test], keys=['train', 'test'])
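# The keys make it easy to split the combined frame back apart later via
# features.loc['train'] and features.loc['test'].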
# Checking for missing data, showing every variable with at least one missing value in the combined set
total_missing_data = features.isnull().sum().sort_values(ascending=False)
missing_data_percent = (features.isnull().sum()/features.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total_missing_data, missing_data_percent], axis=1, keys=['Total', 'Percent'])
print(missing_data[missing_data['Percent'] > 0])
# Deleting variables that are not interesting for this case study
features.drop(['Utilities'], axis=1, inplace=True)

# Imputing missing values and transforming certain columns
# Converting OverallCond to str
features.OverallCond = features.OverallCond.astype(str)
# MSSubClass as str
features['MSSubClass'] = features['MSSubClass'].astype(str)
# MSZoning: NA only in the test set; fill with the most frequent value
features['MSZoning'] = features['MSZoning'].fillna(features['MSZoning'].mode()[0])
# LotFrontage: fill NA with the median of houses sharing the same OverallQual value
median = features.groupby('OverallQual')['LotFrontage'].transform('median')
features['LotFrontage'] = features['LotFrontage'].fillna(median)
# Alley: NA in both sets; NA means no access
features['Alley'] = features['Alley'].fillna('NoAccess')
# MasVnrArea: fill NA with the median of houses sharing the same OverallQual value
median = features.groupby('OverallQual')['MasVnrArea'].transform('median')
features['MasVnrArea'] = features['MasVnrArea'].fillna(median)
# MasVnrType: NA in both sets; fill with the most frequent value
features['MasVnrType'] = features['MasVnrType'].fillna(features['MasVnrType'].mode()[0])
# BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2:
# NA in both sets; NA means no basement
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    features[col] = features[col].fillna('NoBsmt')
# TotalBsmtSF: NA only in the test set; I suppose NA means 0
features['TotalBsmtSF'] = features['TotalBsmtSF'].fillna(0)
# Electrical: NA in the train set; fill with the most frequent value
features['Electrical'] = features['Electrical'].fillna(features['Electrical'].mode()[0])
# KitchenAbvGr to categorical
features['KitchenAbvGr'] = features['KitchenAbvGr'].astype(str)
# KitchenQual: NA only in the test set; fill with the most frequent value
features['KitchenQual'] = features['KitchenQual'].fillna(features['KitchenQual'].mode()[0])
# FireplaceQu: NA in both sets; NA means no fireplace
features['FireplaceQu'] = features['FireplaceQu'].fillna('NoFp')
# Garage-like features: NA in both sets; NA means no garage
# (note: filling GarageYrBlt with a string makes it a mixed-type column, so it
# gets one-hot encoded later rather than treated as numeric)
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageYrBlt', 'GarageCond'):
    features[col] = features[col].fillna('NoGrg')
# GarageCars and GarageArea: NA only in the test set; I suppose NA means 0
for col in ('GarageCars', 'GarageArea'):
    features[col] = features[col].fillna(0.0)
# SaleType: NA only in the test set; fill with the most frequent value
features['SaleType'] = features['SaleType'].fillna(features['SaleType'].mode()[0])
# PoolQC: NA in both sets; NA means no pool
features['PoolQC'] = features['PoolQC'].fillna('NoPool')
# MiscFeature: NA in both sets; NA means none
features['MiscFeature'] = features['MiscFeature'].fillna('None')
# Fence: NA in both sets; NA means no fence
features['Fence'] = features['Fence'].fillna('NoFence')
# BsmtHalfBath and BsmtFullBath: NA means 0
for col in ('BsmtHalfBath', 'BsmtFullBath'):
    features[col] = features[col].fillna(0)
# Functional: NA means Typ
features['Functional'] = features['Functional'].fillna('Typ')
# NA in the basement square-footage variables means no basement of that type, i.e. 0 square feet
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF'):
    features[col] = features[col].fillna(0)
# Exterior1st: fill NA with the most common value
features['Exterior1st'] = features['Exterior1st'].fillna(features['Exterior1st'].mode()[0])
# Exterior2nd: NA means no second exterior material
features['Exterior2nd'] = features['Exterior2nd'].fillna('NoExt2nd')
# Year and month sold to categorical
features['YrSold'] = features['YrSold'].astype(str)
features['MoSold'] = features['MoSold'].astype(str)

# Adding a total square footage feature (basement, 1st and 2nd floor); the
# original columns could be dropped afterwards, but we keep them for now
features['TotalSF'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
#features.drop(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], axis=1, inplace=True)
###################################################################################################
# Let's rank those categorical features that can be understood to have an order
# Criterion: give a higher rank to better feature values
features = features.replace({'Street' : {'Grvl':1, 'Pave':2},
                             'Alley' : {'NoAccess':0, 'Grvl':1, 'Pave':2},
                             'LotShape' : {'IR3':1, 'IR2':2, 'IR1':3, 'Reg':4},
                             'LandContour' : {'Low':1, 'HLS':2, 'Bnk':3, 'Lvl':4},
                             'LotConfig' : {'FR3':1, 'FR2':2, 'CulDSac':3, 'Corner':4, 'Inside':5},
                             'LandSlope' : {'Gtl':1, 'Mod':2, 'Sev':3},
                             'HouseStyle' : {'1Story':1, '1.5Fin':2, '1.5Unf':3, '2Story':4, '2.5Fin':5, '2.5Unf':6, 'SFoyer':7, 'SLvl':8},
                             'ExterQual' : {'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
                             'ExterCond' : {'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
                             'BsmtQual' : {'NoBsmt':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
                             'BsmtCond' : {'NoBsmt':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
                             'BsmtExposure' : {'NoBsmt':0, 'No':1, 'Mn':2, 'Av':3, 'Gd':4},
                             'BsmtFinType1' : {'NoBsmt':0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6},
                             'BsmtFinType2' : {'NoBsmt':0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6},
                             'HeatingQC' : {'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
                             'CentralAir' : {'N':0, 'Y':1},
                             'KitchenQual' : {'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
                             'Functional' : {'Sal':0, 'Sev':1, 'Maj2':2, 'Maj1':3, 'Mod':4, 'Min2':5, 'Min1':6, 'Typ':7},
                             'FireplaceQu' : {'NoFp':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
                             'GarageType' : {'NoGrg':0, 'Detchd':1, 'CarPort':2, 'BuiltIn':3, 'Basment':4, 'Attchd':5, '2Types':6},
                             'GarageFinish' : {'NoGrg':0, 'Unf':1, 'RFn':2, 'Fin':3},
                             'GarageQual' : {'NoGrg':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
                             'GarageCond' : {'NoGrg':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
                             'PavedDrive' : {'N':0, 'P':1, 'Y':2},
                             'PoolQC' : {'NoPool':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4},
                             'Fence' : {'NoFence':0, 'MnWw':1, 'GdWo':2, 'MnPrv':3, 'GdPrv':4}
                             })
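# Optional sanity check (a sketch, not part of the original pipeline): after the
# replace above, none of the remapped columns should still contain strings.
# checked_cols is a hypothetical subset of the columns listed in the mapping;
# commented out so it does not affect the run.
#checked_cols = ['Street', 'Alley', 'LotShape', 'BsmtQual', 'PoolQC', 'Fence']
#for c in checked_cols:
#    assert features[c].dtype != object, '{} still has unmapped strings'.format(c)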
###################################################################################################
# Box-Cox transformation of the most skewed features
numeric_feats = features.dtypes[features.dtypes != 'object'].index

# Check the skew of all numerical features
skewed_feats = features[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features:")
skewness = pd.DataFrame({'Skew': skewed_feats})
print(skewness.head(10))

# Box-Cox: keep only the strongly skewed features (boxcox1p is already imported above)
skewness = skewness[abs(skewness['Skew']) > 0.75]
print("There are {} skewed numerical features to Box Cox transform\n".format(skewness.shape[0]))
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    features[feat] = boxcox1p(features[feat], lam)
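# For reference: boxcox1p(x, lam) computes the Box-Cox transform of 1 + x,
# i.e. ((1 + x)**lam - 1) / lam for lam != 0, and tends to log1p(x) as
# lam -> 0, so lam = 0.15 behaves much like a mild log transform.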
# Label encoding for some categorical features
categorical_features = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
                        'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
                        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
                        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
                        'YrSold', 'MoSold')
lbl = LabelEncoder()
for col in categorical_features:
    lbl.fit(list(features[col].values))
    features[col] = lbl.transform(list(features[col].values))
# Getting dummies for the remaining categorical features
features = pd.get_dummies(features)
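# Because the dummies are created on the concatenated frame, train and test end
# up with exactly the same columns; encoding the two sets separately could leave
# them with mismatched dummy columns.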
# Splitting the combined frame back into train and test feature matrices
train_features = features.loc['train'].select_dtypes(include=[np.number]).values
test_features = features.loc['test'].select_dtypes(include=[np.number]).values
# Validation function
n_folds = 5
def rmsle_cv(model):
    # The KFold object itself must be passed as cv: calling get_n_splits here, as
    # some kernels do, would hand cross_val_score a plain integer and silently
    # drop the shuffling and the random seed
    kf = KFold(n_folds, shuffle=True, random_state=101010)
    rmse = np.sqrt(-cross_val_score(model, train_features, train_labels,
                                    scoring="neg_mean_squared_error", cv=kf))
    return rmse
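# Since train_labels already holds log1p(SalePrice), the plain RMSE computed by
# rmsle_cv is exactly the competition metric, RMSLE (RMSE between log prices).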
# Modelling
enet_model = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=101010))
print(enet_model)
#score = rmsle_cv(enet_model)
#print("\nRMSLE: {:.4f} (+/- {:.4f})\n".format(score.mean(), score.std()))

gb_model = ensemble.GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                              max_depth=4, max_features='sqrt',
                                              min_samples_leaf=15, min_samples_split=10,
                                              loss='huber', random_state=101010)
print(gb_model)
#score = rmsle_cv(gb_model)
#print("\nRMSLE: {:.4f} (+/- {:.4f})\n".format(score.mean(), score.std()))

xgb_model = xgb.XGBRegressor(colsample_bytree=0.2, gamma=0.0,
                             learning_rate=0.05, max_depth=3,
                             min_child_weight=1.7, n_estimators=2200,
                             reg_alpha=0.9, reg_lambda=0.6,
                             subsample=0.5, silent=1, seed=101010)
print(xgb_model)
#score = rmsle_cv(xgb_model)
#print("\nRMSLE: {:.4f} (+/- {:.4f})\n".format(score.mean(), score.std()))

lasso_model = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=101010))
print(lasso_model)
#score = rmsle_cv(lasso_model)
#print("\nRMSLE: {:.4f} (+/- {:.4f})\n".format(score.mean(), score.std()))

krr_model = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
print(krr_model)
#score = rmsle_cv(krr_model)
#print("\nRMSLE: {:.4f} (+/- {:.4f})\n".format(score.mean(), score.std()))
# Now let's check how the averaged models perform
averaged_models = AveragingModels(models=(gb_model, xgb_model, enet_model, lasso_model, krr_model))
print("AVERAGED MODELS")
score = rmsle_cv(averaged_models)
print("\nRMSLE: {:.4f} (+/- {:.4f})\n".format(score.mean(), score.std()))
# Getting our SalePrice estimation; expm1 undoes the log1p applied to the labels
averaged_models.fit(train_features, train_labels)
final_labels = np.expm1(averaged_models.predict(test_features))

# Saving to CSV
pd.DataFrame({'Id': test_ID, 'SalePrice': final_labels}).to_csv('submission-12.csv', index=False)