# -*- coding: utf-8 -*-
# This code is initially based on the Kaggle kernel from Sergei Neviadomski, which can be found in the following link
# https://www.kaggle.com/neviadomski/how-to-get-to-top-25-with-simple-model-sklearn/notebook
# and the Kaggle kernel from Pedro Marcelino, which can be found in the link below
# https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python/notebook
# Also, part of the preprocessing and modelling has been inspired by this kernel from Serigne
# https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
# And this kernel from juliencs has been pretty helpful too!
# https://www.kaggle.com/juliencs/a-study-on-regression-applied-to-the-ames-dataset

# Adding needed libraries and reading data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble, tree, linear_model, preprocessing
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.kernel_ridge import KernelRidge
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle
from scipy import stats
from scipy.stats import norm, skew, boxcox
from scipy.special import boxcox1p
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Class AveragingModels
# This class is Serigne's simplest way of stacking the prediction models:
# averaging them. We use it because it matches what we have been doing in the
# late submissions, and it plugs straight into the rmsle_cv function.
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models

    # We define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)
        return self

    # Now we predict with the cloned models and average the results
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)
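
# Illustrative sketch (added note, not part of the original kernel): with three
# hypothetical per-model prediction vectors for two samples, column_stack builds
# a (2, 3) matrix and mean(axis=1) averages across models for each sample.
_demo_preds = np.column_stack([np.array([10.0, 20.0]),
                               np.array([12.0, 18.0]),
                               np.array([11.0, 22.0])])
assert np.allclose(np.mean(_demo_preds, axis=1), [11.0, 20.0])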

train = pd.read_csv("../../train.csv")
test = pd.read_csv("../../test.csv")

# Save the 'Id' column
train_ID = train['Id']
test_ID = test['Id']
train.drop('Id', axis=1, inplace=True)
test.drop('Id', axis=1, inplace=True)

# Visualizing outliers
fig, ax = plt.subplots()
ax.scatter(x=train['GrLivArea'], y=train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
#plt.show()

# Now the outliers can be deleted
train = train.drop(train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)].index)

# Check the graphic again, making sure there are no outliers left
fig, ax = plt.subplots()
ax.scatter(train['GrLivArea'], train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
#plt.show()

# We use the numpy function log1p, which applies log(1+x) to all elements of the column
train["SalePrice"] = np.log1p(train["SalePrice"])
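
# Quick illustrative check (added, not in the original kernel): np.expm1 is the
# exact inverse of np.log1p, which is why the final predictions are mapped back
# to the price scale with np.expm1 at the end of the script.
assert np.allclose(np.expm1(np.log1p([100000.0, 250000.0])), [100000.0, 250000.0])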

# Check the new distribution
sns.distplot(train['SalePrice'], fit=norm)

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Now plot the distribution
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f})'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
#plt.show()

# Splitting into features and labels
train_labels = train.pop('SalePrice')
# The test set does not even have a 'SalePrice' column, so both sets can be concatenated
features = pd.concat([train, test], keys=['train', 'test'])

# Checking for missing data, showing every variable with at least one missing value in the combined set
total_missing_data = features.isnull().sum().sort_values(ascending=False)
missing_data_percent = (features.isnull().sum() / features.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total_missing_data, missing_data_percent], axis=1, keys=['Total', 'Percent'])
print(missing_data[missing_data['Percent'] > 0])

# Deleting variables that are not interesting for this case study
features.drop(['Utilities'], axis=1, inplace=True)

# Imputing missing values and transforming certain columns
# Converting OverallCond to str
features.OverallCond = features.OverallCond.astype(str)
# MSSubClass as str
features['MSSubClass'] = features['MSSubClass'].astype(str)
# MSZoning NA in pred. Filling with the most popular value
features['MSZoning'] = features['MSZoning'].fillna(features['MSZoning'].mode()[0])
# LotFrontage NA filled with the median according to its OverallQual value
median = features.groupby('OverallQual')['LotFrontage'].transform('median')
features['LotFrontage'] = features['LotFrontage'].fillna(median)
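
# Toy sketch (hypothetical data, added for clarity): groupby(...).transform('median')
# returns a Series aligned to the original index, so it slots straight into fillna.
_toy = pd.DataFrame({'OverallQual': [5, 5, 7, 7],
                     'LotFrontage': [50.0, np.nan, 70.0, 90.0]})
_filled = _toy['LotFrontage'].fillna(
    _toy.groupby('OverallQual')['LotFrontage'].transform('median'))
assert _filled.tolist() == [50.0, 50.0, 70.0, 90.0]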

# Alley NA in all. NA means no access
features['Alley'] = features['Alley'].fillna('NoAccess')
# MasVnrArea NA filled with the median according to its OverallQual value
median = features.groupby('OverallQual')['MasVnrArea'].transform('median')
features['MasVnrArea'] = features['MasVnrArea'].fillna(median)
# MasVnrType NA in all. Filling with the most popular value
features['MasVnrType'] = features['MasVnrType'].fillna(features['MasVnrType'].mode()[0])
# BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2
# NA in all. NA means no basement
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    features[col] = features[col].fillna('NoBsmt')
# TotalBsmtSF NA in pred. I suppose NA means 0
features['TotalBsmtSF'] = features['TotalBsmtSF'].fillna(0)
# Electrical NA in pred. Filling with the most popular value
features['Electrical'] = features['Electrical'].fillna(features['Electrical'].mode()[0])
# KitchenAbvGr to categorical
features['KitchenAbvGr'] = features['KitchenAbvGr'].astype(str)
# KitchenQual NA in pred. Filling with the most popular value
features['KitchenQual'] = features['KitchenQual'].fillna(features['KitchenQual'].mode()[0])
# FireplaceQu NA in all. NA means no fireplace
features['FireplaceQu'] = features['FireplaceQu'].fillna('NoFp')
# Garage-like features NA in all. NA means no garage
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageYrBlt', 'GarageCond'):
    features[col] = features[col].fillna('NoGrg')
# GarageCars and GarageArea NA in pred. I suppose NA means 0
for col in ('GarageCars', 'GarageArea'):
    features[col] = features[col].fillna(0.0)
# SaleType NA in pred. Filling with the most popular value
features['SaleType'] = features['SaleType'].fillna(features['SaleType'].mode()[0])
# PoolQC NA in all. NA means no pool
features['PoolQC'] = features['PoolQC'].fillna('NoPool')
# MiscFeature NA in all. NA means none
features['MiscFeature'] = features['MiscFeature'].fillna('None')
# Fence NA in all. NA means no fence
features['Fence'] = features['Fence'].fillna('NoFence')
# BsmtHalfBath and BsmtFullBath NA means 0
for col in ('BsmtHalfBath', 'BsmtFullBath'):
    features[col] = features[col].fillna(0)
# Functional NA means Typ
features['Functional'] = features['Functional'].fillna('Typ')
# NA in the Bsmt SF variables means not that type of basement: 0 square feet
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF'):
    features[col] = features[col].fillna(0)
# NA in Exterior1st filled with the most common value
features['Exterior1st'] = features['Exterior1st'].fillna(features['Exterior1st'].mode()[0])
# NA in Exterior2nd means no 2nd material
features['Exterior2nd'] = features['Exterior2nd'].fillna('NoExt2nd')
# Year and month to categorical
features['YrSold'] = features['YrSold'].astype(str)
features['MoSold'] = features['MoSold'].astype(str)

# Adding a total square footage feature (dropping the Basement, 1st and 2nd floor features is left disabled below)
features['TotalSF'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
#features.drop(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], axis=1, inplace=True)

# Let's rank those categorical features that can be understood to have an order
# Criterion: give a higher ranking to better feature values
features = features.replace({'Street': {'Grvl': 1, 'Pave': 2},
                             'Alley': {'NoAccess': 0, 'Grvl': 1, 'Pave': 2},
                             'LotShape': {'IR3': 1, 'IR2': 2, 'IR1': 3, 'Reg': 4},
                             'LandContour': {'Low': 1, 'HLS': 2, 'Bnk': 3, 'Lvl': 4},
                             'LotConfig': {'FR3': 1, 'FR2': 2, 'CulDSac': 3, 'Corner': 4, 'Inside': 5},
                             'LandSlope': {'Gtl': 1, 'Mod': 2, 'Sev': 3},
                             'HouseStyle': {'1Story': 1, '1.5Fin': 2, '1.5Unf': 3, '2Story': 4, '2.5Fin': 5, '2.5Unf': 6, 'SFoyer': 7, 'SLvl': 8},
                             'ExterQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
                             'ExterCond': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
                             'BsmtQual': {'NoBsmt': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
                             'BsmtCond': {'NoBsmt': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
                             'BsmtExposure': {'NoBsmt': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4},
                             'BsmtFinType1': {'NoBsmt': 0, 'Unf': 1, 'LwQ': 2, 'BLQ': 3, 'Rec': 4, 'ALQ': 5, 'GLQ': 6},
                             'BsmtFinType2': {'NoBsmt': 0, 'Unf': 1, 'LwQ': 2, 'BLQ': 3, 'Rec': 4, 'ALQ': 5, 'GLQ': 6},
                             'HeatingQC': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
                             'CentralAir': {'N': 0, 'Y': 1},
                             'KitchenQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
                             'Functional': {'Sal': 0, 'Sev': 1, 'Maj2': 2, 'Maj1': 3, 'Mod': 4, 'Min2': 5, 'Min1': 6, 'Typ': 7},
                             'FireplaceQu': {'NoFp': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
                             'GarageType': {'NoGrg': 0, 'Detchd': 1, 'CarPort': 2, 'BuiltIn': 3, 'Basment': 4, 'Attchd': 5, '2Types': 6},
                             'GarageFinish': {'NoGrg': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3},
                             'GarageQual': {'NoGrg': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
                             'GarageCond': {'NoGrg': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
                             'PavedDrive': {'N': 0, 'P': 1, 'Y': 2},
                             'PoolQC': {'NoPool': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
                             'Fence': {'NoFence': 0, 'MnWw': 1, 'MnPrv': 2, 'GdWo': 3, 'GdPrv': 4}
                             })
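
# Small sketch (toy values, added for clarity): DataFrame.replace with a nested
# dict maps each column's labels independently, e.g. quality codes to ranks.
_toy_q = pd.DataFrame({'ExterQual': ['TA', 'Gd', 'Ex']}).replace(
    {'ExterQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}})
assert _toy_q['ExterQual'].tolist() == [3, 4, 5]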

###################################################################################################
# Now we try to simplify some of the ranked features, reducing their number of values
# OverallCond was converted to str above, so cast it back to int before binning
features['SimplifiedOverallCond'] = features.OverallCond.astype(int).replace({1: 1, 2: 1, 3: 1,  # bad
                                                                              4: 2, 5: 2, 6: 2,  # average
                                                                              7: 3, 8: 3,        # good
                                                                              9: 4, 10: 4        # excellent
                                                                              })
features['SimplifiedHouseStyle'] = features.HouseStyle.replace({1: 1,        # 1-story houses
                                                                2: 2, 3: 2,  # 1.5-story houses
                                                                4: 3,        # 2-story houses
                                                                5: 4, 6: 4,  # 2.5-story houses
                                                                7: 5, 8: 5   # split-level houses
                                                                })
features['SimplifiedExterQual'] = features.ExterQual.replace({1: 1, 2: 1,  # bad
                                                              3: 2, 4: 2,  # good/average
                                                              5: 3         # excellent
                                                              })
features['SimplifiedExterCond'] = features.ExterCond.replace({1: 1, 2: 1,  # bad
                                                              3: 2, 4: 2,  # good/average
                                                              5: 3         # excellent
                                                              })
features['SimplifiedBsmtQual'] = features.BsmtQual.replace({1: 1, 2: 1,  # bad; no need to map 0, it stays 0
                                                            3: 2, 4: 2,  # good/average
                                                            5: 3         # excellent
                                                            })
features['SimplifiedBsmtCond'] = features.BsmtCond.replace({1: 1, 2: 1,  # bad
                                                            3: 2, 4: 2,  # good/average
                                                            5: 3         # excellent
                                                            })
features['SimplifiedBsmtExposure'] = features.BsmtExposure.replace({1: 1, 2: 1,  # bad
                                                                    3: 2, 4: 2   # good
                                                                    })
features['SimplifiedBsmtFinType1'] = features.BsmtFinType1.replace({1: 1, 2: 1, 3: 1,  # bad
                                                                    4: 2, 5: 2,        # average
                                                                    6: 3               # good
                                                                    })
features['SimplifiedBsmtFinType2'] = features.BsmtFinType2.replace({1: 1, 2: 1, 3: 1,  # bad
                                                                    4: 2, 5: 2,        # average
                                                                    6: 3               # good
                                                                    })
features['SimplifiedHeatingQC'] = features.HeatingQC.replace({1: 1, 2: 1,  # bad
                                                              3: 2, 4: 2,  # good/average
                                                              5: 3         # excellent
                                                              })
features['SimplifiedKitchenQual'] = features.KitchenQual.replace({1: 1, 2: 1,  # bad
                                                                  3: 2, 4: 2,  # good/average
                                                                  5: 3         # excellent
                                                                  })
features['SimplifiedFunctional'] = features.Functional.replace({0: 0, 1: 0,        # really bad
                                                                2: 1, 3: 1,        # quite bad
                                                                4: 2, 5: 2, 6: 2,  # small deductions
                                                                7: 3               # working fine
                                                                })
features['SimplifiedFireplaceQu'] = features.FireplaceQu.replace({1: 1, 2: 1,  # bad
                                                                  3: 2, 4: 2,  # good/average
                                                                  5: 3         # excellent
                                                                  })
features['SimplifiedGarageQual'] = features.GarageQual.replace({1: 1, 2: 1,  # bad
                                                                3: 2, 4: 2,  # good/average
                                                                5: 3         # excellent
                                                                })
features['SimplifiedGarageCond'] = features.GarageCond.replace({1: 1, 2: 1,  # bad
                                                                3: 2, 4: 2,  # good/average
                                                                5: 3         # excellent
                                                                })
features['SimplifiedPoolQC'] = features.PoolQC.replace({1: 1, 2: 1,  # average
                                                        3: 2, 4: 2   # good
                                                        })
features['SimplifiedFence'] = features.Fence.replace({1: 1, 2: 1,  # bad
                                                      3: 2, 4: 2   # good
                                                      })

# Now, let's combine some features to get newer and cooler features
# Overall score of the house (and simplified version)
# OverallCond and KitchenAbvGr were made categorical (str) above, so they are
# cast back to int here to keep the scores numeric
features['OverallScore'] = features['OverallQual'] * features['OverallCond'].astype(int)
features['SimplifiedOverallScore'] = features['OverallQual'] * features['SimplifiedOverallCond']
# Overall score of the garage (and simplified version)
features['GarageScore'] = features['GarageQual'] * features['GarageCond']
features['SimplifiedGarageScore'] = features['SimplifiedGarageQual'] * features['SimplifiedGarageCond']
# Overall score of the exterior (and simplified version)
features['ExterScore'] = features['ExterQual'] * features['ExterCond']
features['SimplifiedExterScore'] = features['SimplifiedExterQual'] * features['SimplifiedExterCond']
# Overall score of the pool (and simplified version)
features['PoolScore'] = features['PoolQC'] * features['PoolArea']
features['SimplifiedPoolScore'] = features['SimplifiedPoolQC'] * features['PoolArea']
# Overall score of the kitchens (and simplified version)
features['KitchenScore'] = features['KitchenQual'] * features['KitchenAbvGr'].astype(int)
features['SimplifiedKitchenScore'] = features['SimplifiedKitchenQual'] * features['KitchenAbvGr'].astype(int)
# Overall score of the fireplaces (and simplified version)
features['FireplaceScore'] = features['FireplaceQu'] * features['Fireplaces']
features['SimplifiedFireplaceScore'] = features['SimplifiedFireplaceQu'] * features['Fireplaces']
# Total number of bathrooms, counting half baths as 0.5
features['TotalBaths'] = features['FullBath'] + (0.5 * features['HalfBath']) + features['BsmtFullBath'] + (0.5 * features['BsmtHalfBath'])
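
# Worked example (hypothetical house, added for clarity): 2 full + 1 half +
# 1 basement full + 0 basement half -> 2 + 0.5*1 + 1 + 0.5*0 = 3.5 baths.
assert 2 + (0.5 * 1) + 1 + (0.5 * 0) == 3.5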

###################################################################################################
# Box-Cox transformation of the most skewed features
numeric_feats = features.dtypes[features.dtypes != 'object'].index

# Check the skew of all numerical features
skewed_feats = features[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features:")
skewness = pd.DataFrame({'Skew': skewed_feats})
print(skewness.head(10))

# Box-Cox; filter on the 'Skew' column so we actually keep only the skewed rows
skewness = skewness[abs(skewness['Skew']) > 0.75]
print("There are {} skewed numerical features to Box Cox transform\n".format(skewness.shape[0]))
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    features[feat] = boxcox1p(features[feat], lam)
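
# Sanity sketch (illustrative, added): boxcox1p(x, lam) computes
# ((1 + x)**lam - 1) / lam and reduces to log1p(x) as lam -> 0, so lam = 0.15
# is a mild compression of the large values.
assert np.isclose(boxcox1p(9.0, 0.15), ((1.0 + 9.0) ** 0.15 - 1.0) / 0.15)
assert np.isclose(boxcox1p(9.0, 0.0), np.log1p(9.0))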

# Label encoding for some categorical features
categorical_features = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
                        'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
                        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
                        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
                        'YrSold', 'MoSold')
lbl = LabelEncoder()
for col in categorical_features:
    lbl.fit(list(features[col].values))
    features[col] = lbl.transform(list(features[col].values))

# Getting dummies; dtype=float keeps the indicators numeric for the
# select_dtypes split below (recent pandas returns bool columns by default)
features = pd.get_dummies(features, dtype=float)
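
# Illustrative sketch (added): get_dummies one-hot encodes the remaining object
# columns; on a toy Series it yields one indicator column per distinct label.
_toy_dum = pd.get_dummies(pd.Series(['a', 'b', 'a']))
assert list(_toy_dum.columns) == ['a', 'b'] and _toy_dum['a'].tolist() == [1, 0, 1]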

# Splitting features back into train and test
train_features = features.loc['train'].select_dtypes(include=[np.number]).values
test_features = features.loc['test'].select_dtypes(include=[np.number]).values

# Validation function
n_folds = 5

def rmsle_cv(model):
    # Pass the KFold object itself so shuffle and random_state take effect
    # (calling get_n_splits would reduce it to a plain integer)
    kf = KFold(n_folds, shuffle=True, random_state=101010)
    rmse = np.sqrt(-cross_val_score(model, train_features, train_labels,
                                    scoring="neg_mean_squared_error", cv=kf))
    return rmse
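
# Note (added): because train_labels are log1p(SalePrice), plain RMSE on this
# scale is the RMSLE of the untransformed prices; a quick check with toy prices:
_a = np.array([100000.0, 200000.0])
_b = np.array([110000.0, 190000.0])
_rmsle = np.sqrt(np.mean((np.log1p(_a) - np.log1p(_b)) ** 2))
assert np.isclose(_rmsle, np.sqrt(mean_squared_error(np.log1p(_a), np.log1p(_b))))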

# Modelling
enet_model = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0002, l1_ratio=.9, random_state=101010))
print(enet_model)
#score = rmsle_cv(enet_model)
#print("\nRMSLE: {:.4f} (+/- {:.4f})\n".format(score.mean(), score.std()))

gb_model = ensemble.GradientBoostingRegressor(n_estimators=3000, learning_rate=0.02,
                                              max_depth=3, max_features='sqrt',
                                              min_samples_leaf=15, min_samples_split=10,
                                              loss='huber', random_state=101010)
print(gb_model)
#score = rmsle_cv(gb_model)
#print("\nRMSLE: {:.4f} (+/- {:.4f})\n".format(score.mean(), score.std()))

lasso_model = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=101010))
print(lasso_model)
#score = rmsle_cv(lasso_model)
#print("\nRMSLE: {:.4f} (+/- {:.4f})\n".format(score.mean(), score.std()))

# Now let's check how the averaged models work
averaged_models = AveragingModels(models=(gb_model, enet_model, lasso_model))
print("AVERAGED MODELS")
score = rmsle_cv(averaged_models)
print("\nRMSLE: {:.4f} (+/- {:.4f})\n".format(score.mean(), score.std()))

# Getting our SalePrice estimation; expm1 inverts the log1p applied to the labels
averaged_models.fit(train_features, train_labels)
final_labels = np.expm1(averaged_models.predict(test_features))

# Saving to CSV
pd.DataFrame({'Id': test_ID, 'SalePrice': final_labels}).to_csv('submission-18.csv', index=False)