# kaggle-16.py

# -*- coding: utf-8 -*-
# This code is initially based on the Kaggle kernel from Sergei Neviadomski, which can be found in the following link
# https://www.kaggle.com/neviadomski/how-to-get-to-top-25-with-simple-model-sklearn/notebook
# and the Kaggle kernel from Pedro Marcelino, which can be found in the link below
# https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python/notebook
# Also, part of the preprocessing and modelling has been inspired by this kernel from Serigne
# https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
# And this kernel from juliencs has been pretty helpful too!
# https://www.kaggle.com/juliencs/a-study-on-regression-applied-to-the-ames-dataset

# Adding needed libraries and reading data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble, tree, linear_model, preprocessing
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.kernel_ridge import KernelRidge
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle
from scipy import stats
from scipy.stats import norm, skew, boxcox
from scipy.special import boxcox1p
import xgboost as xgb
import lightgbm as lgb
import warnings

warnings.filterwarnings('ignore')

# Class AveragingModels
# This class is Serigne's simplest way of stacking the prediction models:
# averaging them. We use it because it matches what we have been doing in the
# latest submissions, and it plugs straight into the rmsle_cv function below.
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models

    # We define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)
        return self

    # Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)
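
# A minimal usage sketch of AveragingModels (illustrative only; the real
# instantiation with the tuned base models happens further below, and the
# names X_train, y_train, X_test are placeholders):
#   averaged = AveragingModels(models=(Lasso(alpha=0.0005), ElasticNet(alpha=0.0005)))
#   averaged.fit(X_train, y_train)    # fits a clone of each base model
#   preds = averaged.predict(X_test)  # equal-weight mean of the base predictions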

train = pd.read_csv("../../train.csv")
test = pd.read_csv("../../test.csv")

# Save the 'Id' column
train_ID = train['Id']
test_ID = test['Id']
train.drop('Id', axis=1, inplace=True)
test.drop('Id', axis=1, inplace=True)

# Visualizing outliers
fig, ax = plt.subplots()
ax.scatter(x=train['GrLivArea'], y=train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
#plt.show()

# Now the outliers can be deleted
train = train.drop(train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)].index)

# Check the graphic again, making sure there are no outliers left
fig, ax = plt.subplots()
ax.scatter(train['GrLivArea'], train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
#plt.show()

# We use the numpy function log1p, which applies log(1+x) to all elements of the column
train["SalePrice"] = np.log1p(train["SalePrice"])

# Check the new distribution
sns.distplot(train['SalePrice'], fit=norm)

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Now plot the distribution
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
#plt.show()

# Splitting to features and labels
train_labels = train.pop('SalePrice')
# Test set does not even have a 'SalePrice' column, so both sets can be concatenated
features = pd.concat([train, test], keys=['train', 'test'])

# Checking for missing data, showing every variable with at least one missing value in the whole data set
total_missing_data = features.isnull().sum().sort_values(ascending=False)
missing_data_percent = (features.isnull().sum() / features.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total_missing_data, missing_data_percent], axis=1, keys=['Total', 'Percent'])
print(missing_data[missing_data['Percent'] > 0])

# Deleting non-interesting variables for this case study
features.drop(['Utilities'], axis=1, inplace=True)

# Imputing missing values and transforming certain columns
# Converting OverallCond to str
features.OverallCond = features.OverallCond.astype(str)
# MSSubClass as str
features['MSSubClass'] = features['MSSubClass'].astype(str)
# MSZoning NA in pred. filling with most popular values
features['MSZoning'] = features['MSZoning'].fillna(features['MSZoning'].mode()[0])
# LotFrontage NA filling with median according to its OverallQual value
median = features.groupby('OverallQual')['LotFrontage'].transform('median')
features['LotFrontage'] = features['LotFrontage'].fillna(median)
# Alley NA in all. NA means no access
features['Alley'] = features['Alley'].fillna('NoAccess')
# MasVnrArea NA filling with median according to its OverallQual value
median = features.groupby('OverallQual')['MasVnrArea'].transform('median')
features['MasVnrArea'] = features['MasVnrArea'].fillna(median)
# MasVnrType NA in all. filling with most popular values
features['MasVnrType'] = features['MasVnrType'].fillna(features['MasVnrType'].mode()[0])
# BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2
# NA in all. NA means No basement
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    features[col] = features[col].fillna('NoBsmt')
# TotalBsmtSF NA in pred. I suppose NA means 0
features['TotalBsmtSF'] = features['TotalBsmtSF'].fillna(0)
# Electrical NA in pred. filling with most popular values
features['Electrical'] = features['Electrical'].fillna(features['Electrical'].mode()[0])
# KitchenAbvGr to categorical
features['KitchenAbvGr'] = features['KitchenAbvGr'].astype(str)
# KitchenQual NA in pred. filling with most popular values
features['KitchenQual'] = features['KitchenQual'].fillna(features['KitchenQual'].mode()[0])
# FireplaceQu NA in all. NA means No Fireplace
features['FireplaceQu'] = features['FireplaceQu'].fillna('NoFp')
# Garage-like features NA in all. NA means No Garage
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageYrBlt', 'GarageCond'):
    features[col] = features[col].fillna('NoGrg')
# GarageCars and GarageArea NA in pred. I suppose NA means 0
for col in ('GarageCars', 'GarageArea'):
    features[col] = features[col].fillna(0.0)
# SaleType NA in pred. filling with most popular values
features['SaleType'] = features['SaleType'].fillna(features['SaleType'].mode()[0])
# PoolQC NA in all. NA means No Pool
features['PoolQC'] = features['PoolQC'].fillna('NoPool')
# MiscFeature NA in all. NA means None
features['MiscFeature'] = features['MiscFeature'].fillna('None')
# Fence NA in all. NA means no fence
features['Fence'] = features['Fence'].fillna('NoFence')
# BsmtHalfBath and BsmtFullBath NA means 0
for col in ('BsmtHalfBath', 'BsmtFullBath'):
    features[col] = features[col].fillna(0)
# Functional NA means Typ
features['Functional'] = features['Functional'].fillna('Typ')
# NA in Bsmt SF variables means not that type of basement, 0 square feet
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF'):
    features[col] = features[col].fillna(0)
# NA in Exterior1st filled with the most common value
features['Exterior1st'] = features['Exterior1st'].fillna(features['Exterior1st'].mode()[0])
# NA in Exterior2nd means No 2nd material
features['Exterior2nd'] = features['Exterior2nd'].fillna('NoExt2nd')
# Year and Month to categorical
features['YrSold'] = features['YrSold'].astype(str)
features['MoSold'] = features['MoSold'].astype(str)

# Adding a total square-footage feature (the drop of the original Basement,
# 1st and 2nd floor columns is left commented out, so they are kept as well)
features['TotalSF'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
#features.drop(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], axis=1, inplace=True)

# Let's rank those categorical features that can be understood to have an order
# Criterion: give higher ranking to better feature values
features = features.replace({'Street' : {'Grvl':1, 'Pave':2},
                             'Alley' : {'NoAccess':0, 'Grvl':1, 'Pave':2},
                             'LotShape' : {'IR3':1, 'IR2':2, 'IR1':3, 'Reg':4},
                             'LandContour' : {'Low':1, 'HLS':2, 'Bnk':3, 'Lvl':4},
                             'LotConfig' : {'FR3':1, 'FR2':2, 'CulDSac':3, 'Corner':4, 'Inside':5},
                             'LandSlope' : {'Gtl':1, 'Mod':2, 'Sev':3},
                             'HouseStyle' : {'1Story':1, '1.5Fin':2, '1.5Unf':3, '2Story':4, '2.5Fin':5, '2.5Unf':6, 'SFoyer':7, 'SLvl':8},
                             'ExterQual' : {'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
                             'ExterCond' : {'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
                             'BsmtQual' : {'NoBsmt':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
                             'BsmtCond' : {'NoBsmt':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
                             'BsmtExposure' : {'NoBsmt':0, 'No':1, 'Mn':2, 'Av':3, 'Gd':4},
                             'BsmtFinType1' : {'NoBsmt':0, 'Unf':1, 'LwQ':2, 'BLQ':3, 'Rec':4, 'ALQ':5, 'GLQ':6},
                             'BsmtFinType2' : {'NoBsmt':0, 'Unf':1, 'LwQ':2, 'BLQ':3, 'Rec':4, 'ALQ':5, 'GLQ':6},
                             'HeatingQC' : {'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
                             'CentralAir' : {'N':0, 'Y':1},
                             'KitchenQual' : {'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
                             'Functional' : {'Sal':0, 'Sev':1, 'Maj2':2, 'Maj1':3, 'Mod':4, 'Min2':5, 'Min1':6, 'Typ':7},
                             'FireplaceQu' : {'NoFp':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
                             'GarageType' : {'NoGrg':0, 'Detchd':1, 'CarPort':2, 'BuiltIn':3, 'Basment':4, 'Attchd':5, '2Types':6},
                             'GarageFinish' : {'NoGrg':0, 'Unf':1, 'RFn':2, 'Fin':3},
                             'GarageQual' : {'NoGrg':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
                             'GarageCond' : {'NoGrg':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
                             'PavedDrive' : {'N':0, 'P':1, 'Y':2},
                             'PoolQC' : {'NoPool':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4},
                             'Fence' : {'NoFence':0, 'MnWw':1, 'MnPrv':2, 'GdWo':3, 'GdPrv':4}
                             })
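
# After this mapping the ranked columns hold small integers, e.g. an ExterQual
# of 'TA' becomes 3 and 'Ex' becomes 5, which is what lets the score features
# further below multiply them like ordinary numbers.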

# Now we try to simplify some of the ranked features, reducing their number of values
features['SimplifiedOverallCond'] = features.OverallCond.replace({1:1, 2:1, 3:1,  # bad
                                                                  4:2, 5:2, 6:2,  # average
                                                                  7:3, 8:3,       # good
                                                                  9:4, 10:4       # excellent
                                                                  })
features['SimplifiedHouseStyle'] = features.HouseStyle.replace({1:1,        # 1 storey houses
                                                                2:2, 3:2,   # 1.5 storey houses
                                                                4:3,        # 2 storey houses
                                                                5:4, 6:4,   # 2.5 storey houses
                                                                7:5, 8:5    # split houses
                                                                })
features['SimplifiedExterQual'] = features.ExterQual.replace({1:1, 2:1,  # bad
                                                              3:2, 4:2,  # good/average
                                                              5:3        # excellent
                                                              })
features['SimplifiedExterCond'] = features.ExterCond.replace({1:1, 2:1,  # bad
                                                              3:2, 4:2,  # good/average
                                                              5:3        # excellent
                                                              })
features['SimplifiedBsmtQual'] = features.BsmtQual.replace({1:1, 2:1,  # bad (0 stays 0, so it needs no mapping)
                                                            3:2, 4:2,  # good/average
                                                            5:3        # excellent
                                                            })
features['SimplifiedBsmtCond'] = features.BsmtCond.replace({1:1, 2:1,  # bad
                                                            3:2, 4:2,  # good/average
                                                            5:3        # excellent
                                                            })
features['SimplifiedBsmtExposure'] = features.BsmtExposure.replace({1:1, 2:1,  # bad
                                                                    3:2, 4:2   # good
                                                                    })
features['SimplifiedBsmtFinType1'] = features.BsmtFinType1.replace({1:1, 2:1, 3:1,  # bad
                                                                    4:2, 5:2,       # average
                                                                    6:3             # good
                                                                    })
features['SimplifiedBsmtFinType2'] = features.BsmtFinType2.replace({1:1, 2:1, 3:1,  # bad
                                                                    4:2, 5:2,       # average
                                                                    6:3             # good
                                                                    })
features['SimplifiedHeatingQC'] = features.HeatingQC.replace({1:1, 2:1,  # bad
                                                              3:2, 4:2,  # good/average
                                                              5:3        # excellent
                                                              })
features['SimplifiedKitchenQual'] = features.KitchenQual.replace({1:1, 2:1,  # bad
                                                                  3:2, 4:2,  # good/average
                                                                  5:3        # excellent
                                                                  })
features['SimplifiedFunctional'] = features.Functional.replace({0:0, 1:0,       # really bad
                                                                2:1, 3:1,       # quite bad
                                                                4:2, 5:2, 6:2,  # small deductions
                                                                7:3             # working fine
                                                                })
features['SimplifiedFireplaceQu'] = features.FireplaceQu.replace({1:1, 2:1,  # bad
                                                                  3:2, 4:2,  # good/average
                                                                  5:3        # excellent
                                                                  })
features['SimplifiedGarageQual'] = features.GarageQual.replace({1:1, 2:1,  # bad
                                                                3:2, 4:2,  # good/average
                                                                5:3        # excellent
                                                                })
features['SimplifiedGarageCond'] = features.GarageCond.replace({1:1, 2:1,  # bad
                                                                3:2, 4:2,  # good/average
                                                                5:3        # excellent
                                                                })
features['SimplifiedPoolQC'] = features.PoolQC.replace({1:1, 2:1,  # average
                                                        3:2, 4:2   # good
                                                        })
features['SimplifiedFence'] = features.Fence.replace({1:1, 2:1,  # bad
                                                      3:2, 4:2   # good
                                                      })

# Now, let's combine some features to get newer and cooler features
# Overall Score of the house (and simplified)
features['OverallScore'] = features['OverallQual'] * features['OverallCond']
features['SimplifiedOverallScore'] = features['OverallQual'] * features['SimplifiedOverallCond']
# Overall Score of the garage (and simplified garage)
features['GarageScore'] = features['GarageQual'] * features['GarageCond']
features['SimplifiedGarageScore'] = features['SimplifiedGarageQual'] * features['SimplifiedGarageCond']
# Overall Score of the exterior (and simplified exterior)
features['ExterScore'] = features['ExterQual'] * features['ExterCond']
features['SimplifiedExterScore'] = features['SimplifiedExterQual'] * features['SimplifiedExterCond']
# Overall Score of the pool (and simplified pool)
features['PoolScore'] = features['PoolQC'] * features['PoolArea']
features['SimplifiedPoolScore'] = features['SimplifiedPoolQC'] * features['PoolArea']
# Overall Score of the kitchens (and simplified kitchens)
features['KitchenScore'] = features['KitchenQual'] * features['KitchenAbvGr']
features['SimplifiedKitchenScore'] = features['SimplifiedKitchenQual'] * features['KitchenAbvGr']
# Overall Score of the fireplaces (and simplified fireplaces)
features['FireplaceScore'] = features['FireplaceQu'] * features['Fireplaces']
features['SimplifiedFireplaceScore'] = features['SimplifiedFireplaceQu'] * features['Fireplaces']
# Total number of bathrooms
features['TotalBaths'] = features['FullBath'] + (0.5 * features['HalfBath']) + features['BsmtFullBath'] + (0.5 * features['BsmtHalfBath'])
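
# Half baths count as 0.5, so e.g. 2 full + 1 half + 1 basement full + 0 basement half
# comes out as 2 + 0.5 + 1 + 0 = 3.5 total baths (illustrative worked example).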

###################################################################################################
# Let's check the features most correlated to SalePrice and make some transformations to them
temporal_train_features = features.loc['train'].select_dtypes(include=[np.number])
temporal_train_features = pd.concat([temporal_train_features, train_labels], axis=1)
correlated = temporal_train_features.corr()
correlated.sort_values(["SalePrice"], ascending=False, inplace=True)
print("\nMost correlated features to SalePrice:")
print(correlated.SalePrice[0:10])

# Now we'll get some new features
features['SquaredOverallQual'] = features['OverallQual'] ** 2
features['CubedOverallQual'] = features['OverallQual'] ** 3
features['SquaredTotalSF'] = features['TotalSF'] ** 2
features['CubedTotalSF'] = features['TotalSF'] ** 3
features['SquaredGrLivArea'] = features['GrLivArea'] ** 2
features['CubedGrLivArea'] = features['GrLivArea'] ** 3
features['SquaredExterQual'] = features['ExterQual'] ** 2
features['CubedExterQual'] = features['ExterQual'] ** 3
features['SquaredGarageCars'] = features['GarageCars'] ** 2
features['CubedGarageCars'] = features['GarageCars'] ** 3

###################################################################################################
# Box-Cox transformation of the most skewed features
numeric_feats = features.dtypes[features.dtypes != 'object'].index
# Check the skew of all numerical features
skewed_feats = features[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features:")
skewness = pd.DataFrame({'Skew': skewed_feats})
print(skewness.head(10))
# Box-Cox: keep only the features whose absolute skew exceeds the threshold
skewness = skewness[abs(skewness['Skew']) > 0.75]
print("There are {} skewed numerical features to Box Cox transform\n".format(skewness.shape[0]))
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    features[feat] = boxcox1p(features[feat], lam)
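
# scipy.special.boxcox1p(x, lam) computes the Box-Cox transform of (1 + x):
#   ((1 + x)**lam - 1) / lam  for lam != 0, and log1p(x) for lam == 0.
# Quick illustrative check of that identity for the lambda used above:
_x = 10.0
assert np.isclose(boxcox1p(_x, lam), ((1.0 + _x) ** lam - 1.0) / lam)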

# Label encoding for some categorical features
categorical_features = ('FireplaceQu', 'SimplifiedFireplaceQu', 'BsmtQual', 'SimplifiedBsmtQual', 'BsmtCond',
                        'SimplifiedBsmtCond', 'GarageQual', 'SimplifiedGarageQual', 'GarageCond', 'SimplifiedGarageCond',
                        'ExterQual', 'SimplifiedExterQual', 'ExterCond', 'SimplifiedExterCond', 'HeatingQC', 'SimplifiedHeatingQC',
                        'PoolQC', 'SimplifiedPoolQC', 'KitchenQual', 'SimplifiedKitchenQual', 'BsmtFinType1', 'SimplifiedBsmtFinType1',
                        'BsmtFinType2', 'SimplifiedBsmtFinType2', 'Functional', 'SimplifiedFunctional', 'Fence', 'SimplifiedFence',
                        'BsmtExposure', 'SimplifiedBsmtExposure', 'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive', 'Street',
                        'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 'SimplifiedOverallCond', 'YrSold', 'MoSold')
lbl = LabelEncoder()
for col in categorical_features:
    lbl.fit(list(features[col].values))
    features[col] = lbl.transform(list(features[col].values))
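
# LabelEncoder maps the sorted unique values of a column to 0..n-1, e.g.
# (illustrative) LabelEncoder().fit_transform(['2008', '2006', '2007', '2006'])
# returns array([2, 0, 1, 0]); the ranked columns above already hold small
# integers, so this mostly normalizes the remaining string-typed ones.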

# Getting dummies for the remaining categorical (object) columns
features = pd.get_dummies(features)
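
# get_dummies one-hot encodes every remaining object-typed column, e.g. the
# MSZoning column becomes indicator columns such as MSZoning_RL and MSZoning_RM
# (column names shown for illustration); already-numeric columns pass through.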

# Splitting features back into train and test sets
train_features = features.loc['train'].select_dtypes(include=[np.number]).values
test_features = features.loc['test'].select_dtypes(include=[np.number]).values

# Validation function
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=101010)
    rmse = np.sqrt(-cross_val_score(model, train_features, train_labels,
                                    scoring="neg_mean_squared_error", cv=kf))
    return rmse
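
# Since SalePrice was log1p-transformed above, the RMSE computed here on the
# transformed target is effectively the competition's RMSLE:
#   RMSLE = sqrt(mean((log(1 + y_pred) - log(1 + y_true))**2))
# so rmsle_cv returns one such score per fold (5 values with n_folds = 5).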

# Modelling
enet_model = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=101010))
print(enet_model)
#score = rmsle_cv(enet_model)
#print("\nRMSLE: {:.4f} (+/- {:.4f})\n".format(score.mean(), score.std()))

gb_model = ensemble.GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                              max_depth=4, max_features='sqrt',
                                              min_samples_leaf=15, min_samples_split=10,
                                              loss='huber', random_state=101010)
print(gb_model)
#score = rmsle_cv(gb_model)
#print("\nRMSLE: {:.4f} (+/- {:.4f})\n".format(score.mean(), score.std()))

xgb_model = xgb.XGBRegressor(colsample_bytree=0.2, gamma=0.0,
                             learning_rate=0.05, max_depth=3,
                             min_child_weight=1.7, n_estimators=2200,
                             reg_alpha=0.9, reg_lambda=0.6,
                             subsample=0.5, silent=1, seed=101010)
print(xgb_model)
#score = rmsle_cv(xgb_model)
#print("\nRMSLE: {:.4f} (+/- {:.4f})\n".format(score.mean(), score.std()))

lasso_model = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=101010))
print(lasso_model)
#score = rmsle_cv(lasso_model)
#print("\nRMSLE: {:.4f} (+/- {:.4f})\n".format(score.mean(), score.std()))

krr_model = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
print(krr_model)
#score = rmsle_cv(krr_model)
#print("\nRMSLE: {:.4f} (+/- {:.4f})\n".format(score.mean(), score.std()))

# Now let's check how the averaged models work
averaged_models = AveragingModels(models=(gb_model, xgb_model, enet_model, lasso_model, krr_model))
print("AVERAGED MODELS")
score = rmsle_cv(averaged_models)
print("\nRMSLE: {:.4f} (+/- {:.4f})\n".format(score.mean(), score.std()))

# Getting our SalePrice estimation (expm1 reverses the log1p applied to SalePrice)
averaged_models.fit(train_features, train_labels)
final_labels = np.expm1(averaged_models.predict(test_features))

# Saving to CSV
pd.DataFrame({'Id': test_ID, 'SalePrice': final_labels}).to_csv('submission-16.csv', index=False)