kaggle-8.py
# -*- coding: utf-8 -*-
# This code is initially based on the Kaggle kernel from Sergei Neviadomski, which can be found at the following link:
# https://www.kaggle.com/neviadomski/how-to-get-to-top-25-with-simple-model-sklearn/notebook
# and the Kaggle kernel from Pedro Marcelino, which can be found at the link below:
# https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python/notebook
# Also, part of the preprocessing has been inspired by this kernel from Serigne:
# https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard

# Adding needed libraries and reading data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble, tree, linear_model, preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.utils import shuffle
from scipy import stats
from scipy.stats import norm, skew, boxcox
from scipy.special import boxcox1p
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

train = pd.read_csv("../../train.csv")
test = pd.read_csv("../../test.csv")
# Prints R2, RMSE and RMSLE scores
def get_score(prediction, labels):
    # sklearn metrics expect (y_true, y_pred); the original passed them reversed,
    # which silently changes R2 (RMSE and RMSLE are symmetric, so they are unaffected)
    print('R2: {}'.format(r2_score(labels, prediction)))
    # Note: by the time these are called, labels are already log1p(SalePrice),
    # so the RMSE line equals the competition's RMSLE on raw prices
    print('RMSE: {}'.format(np.sqrt(mean_squared_error(labels, prediction))))
    print('RMSLE: {}'.format(np.sqrt(np.square(np.log(prediction + 1) - np.log(labels + 1)).mean())))

# Shows scores for train and validation sets
def train_test(estimator, x_trn, x_tst, y_trn, y_tst):
    prediction_train = estimator.predict(x_trn)
    # Printing estimator
    print(estimator)
    # Printing train scores
    get_score(prediction_train, y_trn)
    prediction_test = estimator.predict(x_tst)
    # Printing validation scores
    print("Test")
    get_score(prediction_test, y_tst)
# Deleting outliers
train = train.drop(train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)].index)

# We use the numpy function log1p, which applies log(1+x) to all elements of the column;
# np.expm1 is its exact inverse and is used at the end to recover dollar prices
train["SalePrice"] = np.log1p(train["SalePrice"])

# Check the new distribution
sns.distplot(train['SalePrice'], fit=norm)

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Now plot the distribution
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
#plt.show()
# Splitting into features and labels and deleting variables I don't need
train_labels = train.pop('SalePrice')

# The test set has no 'SalePrice' column, so both sets can be concatenated
features = pd.concat([train, test], keys=['train', 'test'])
# 'Utilities' is nearly constant in this dataset, so it carries almost no information
features.drop(['Utilities'], axis=1, inplace=True)

# Checking for missing data, showing every variable with at least one missing value
total_missing_data = features.isnull().sum().sort_values(ascending=False)
missing_data_percent = (features.isnull().sum() / features.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total_missing_data, missing_data_percent], axis=1, keys=['Total', 'Percent'])
print(missing_data[missing_data['Percent'] > 0])
# Converting OverallCond and OverallQual to str
features.OverallCond = features.OverallCond.astype(str)
features.OverallQual = features.OverallQual.astype(str)
# MSSubClass as str
features['MSSubClass'] = features['MSSubClass'].astype(str)
# MSZoning NA in pred. filling with most popular value
features['MSZoning'] = features['MSZoning'].fillna(features['MSZoning'].mode()[0])
# LotFrontage NA filling with median according to its OverallQual value
median = features.groupby('OverallQual')['LotFrontage'].transform('median')
features['LotFrontage'] = features['LotFrontage'].fillna(median)
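# A common alternative (used e.g. in Serigne's kernel linked above) is to impute
# LotFrontage from the median of the house's Neighborhood rather than its
# OverallQual; a minimal sketch, assuming 'Neighborhood' itself has no missing values:
# features['LotFrontage'] = features.groupby('Neighborhood')['LotFrontage'] \
#     .transform(lambda x: x.fillna(x.median()))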
# Alley NA in all. NA means no access
features['Alley'] = features['Alley'].fillna('NOACCESS')
# MasVnrArea NA filling with median according to its OverallQual value
median = features.groupby('OverallQual')['MasVnrArea'].transform('median')
features['MasVnrArea'] = features['MasVnrArea'].fillna(median)
# MasVnrType NA in all. filling with most popular value
features['MasVnrType'] = features['MasVnrType'].fillna(features['MasVnrType'].mode()[0])
# BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2
# NA in all. NA means No basement
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    features[col] = features[col].fillna('NoBSMT')
# TotalBsmtSF NA in pred. I suppose NA means 0
features['TotalBsmtSF'] = features['TotalBsmtSF'].fillna(0)
# Electrical NA in pred. filling with most popular value
features['Electrical'] = features['Electrical'].fillna(features['Electrical'].mode()[0])
# KitchenAbvGr to categorical
features['KitchenAbvGr'] = features['KitchenAbvGr'].astype(str)
# KitchenQual NA in pred. filling with most popular value
features['KitchenQual'] = features['KitchenQual'].fillna(features['KitchenQual'].mode()[0])
# FireplaceQu NA in all. NA means No Fireplace
features['FireplaceQu'] = features['FireplaceQu'].fillna('NoFP')
# Garage-like features NA in all. NA means No Garage
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageYrBlt', 'GarageCond'):
    features[col] = features[col].fillna('NoGRG')
# GarageCars and GarageArea NA in pred. I suppose NA means 0
for col in ('GarageCars', 'GarageArea'):
    features[col] = features[col].fillna(0.0)
# SaleType NA in pred. filling with most popular value
features['SaleType'] = features['SaleType'].fillna(features['SaleType'].mode()[0])
# PoolQC NA in all. NA means No Pool
features['PoolQC'] = features['PoolQC'].fillna('NoPool')
# MiscFeature NA in all. NA means None
features['MiscFeature'] = features['MiscFeature'].fillna('None')
# Fence NA in all. NA means no fence
features['Fence'] = features['Fence'].fillna('NoFence')
# BsmtHalfBath and BsmtFullBath NA means 0
for col in ('BsmtHalfBath', 'BsmtFullBath'):
    features[col] = features[col].fillna(0)
# Functional NA means Typ
features['Functional'] = features['Functional'].fillna('Typ')
# NA in Bsmt SF variables means not that type of basement, 0 square feet
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF'):
    features[col] = features[col].fillna(0)
# NA in Exterior1st and Exterior2nd filled with the most common value
for col in ('Exterior1st', 'Exterior2nd'):
    features[col] = features[col].fillna(features[col].mode()[0])
# Year and Month to categorical
features['YrSold'] = features['YrSold'].astype(str)
features['MoSold'] = features['MoSold'].astype(str)
# Adding total square-footage feature and removing Basement, 1st and 2nd floor features
features['TotalSF'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
features.drop(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], axis=1, inplace=True)
# Box-Cox transformation of the most skewed features
numeric_features = features.dtypes[features.dtypes != "object"].index
# Check the skew of all numerical features
skewed_features = features[numeric_features].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew': skewed_features})
print(skewness.head(10))
# Box-Cox: filter on the 'Skew' column so only skewed rows survive
# (filtering the whole DataFrame would merely mask values and keep every index)
skewness = skewness[abs(skewness['Skew']) > 0.75]
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    features[feat] = boxcox1p(features[feat], lam)
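# Sanity-check sketch (not in the original kernel): recompute skew on the
# transformed columns to confirm it actually dropped. The fixed lam=0.15 follows
# Serigne's kernel; scipy.stats.boxcox could instead estimate a per-feature
# lambda, at the cost of handling zero values separately.
# post_skew = features[skewed_features].apply(lambda x: skew(x.dropna()))
# print(post_skew.sort_values(ascending=False).head(10))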
# Getting dummies from Condition1 and Condition2
conditions = set([x for x in features['Condition1']] + [x for x in features['Condition2']])
dummies = pd.DataFrame(data=np.zeros((len(features.index), len(conditions))),
                       index=features.index, columns=conditions)
for i, cond in enumerate(zip(features['Condition1'], features['Condition2'])):
    # .ix is long deprecated; address the row positionally and the columns by label
    dummies.loc[dummies.index[i], list(cond)] = 1
features = pd.concat([features, dummies.add_prefix('Condition_')], axis=1)
features.drop(['Condition1', 'Condition2'], axis=1, inplace=True)
# Getting dummies from Exterior1st and Exterior2nd
exteriors = set([x for x in features['Exterior1st']] + [x for x in features['Exterior2nd']])
dummies = pd.DataFrame(data=np.zeros((len(features.index), len(exteriors))),
                       index=features.index, columns=exteriors)
for i, ext in enumerate(zip(features['Exterior1st'], features['Exterior2nd'])):
    dummies.loc[dummies.index[i], list(ext)] = 1
features = pd.concat([features, dummies.add_prefix('Exterior_')], axis=1)
features.drop(['Exterior1st', 'Exterior2nd'], axis=1, inplace=True)
# Getting dummies from all other categorical vars
for col in features.dtypes[features.dtypes == 'object'].index:
    for_dummy = features.pop(col)
    features = pd.concat([features, pd.get_dummies(for_dummy, prefix=col)], axis=1)
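# Note: because train and test were concatenated before one-hot encoding, both
# splits are guaranteed to end up with identical dummy columns; encoding them
# separately could produce mismatched feature sets.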
### Splitting features
train_features = features.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
test_features = features.loc['test'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
### Shuffling train sets
train_features, train_labels = shuffle(train_features, train_labels, random_state=5)
### Splitting
x_train, x_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.1, random_state=200)
'''
Elastic Net
'''
ENSTest = linear_model.ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10],
                                    l1_ratio=[.01, .1, .5, .9, .99], max_iter=5000).fit(x_train, y_train)
train_test(ENSTest, x_train, x_test, y_train, y_test)

# Average R2 score and standard deviation of 5-fold cross-validation
# (cross_val_score uses R2 for regressors by default, so "R2", not "Accuracy")
scores = cross_val_score(ENSTest, train_features, train_labels, cv=5)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# The gradient-boosting models below are disabled (kept inside a string literal);
# only the Elastic Net model is trained and used for the submission
'''
Gradient Boosting
GBest = ensemble.GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                           max_depth=4, max_features='sqrt',
                                           min_samples_leaf=15, min_samples_split=10,
                                           loss='huber', random_state=5).fit(x_train, y_train)
train_test(GBest, x_train, x_test, y_train, y_test)
# Average R2 score and standard deviation of 5-fold cross-validation
scores = cross_val_score(GBest, train_features, train_labels, cv=5)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

XGBoost
XGBest = xgb.XGBRegressor(max_depth=3, learning_rate=0.05, n_estimators=3000).fit(x_train, y_train)
train_test(XGBest, x_train, x_test, y_train, y_test)
# Average R2 score and standard deviation of 5-fold cross-validation
scores = cross_val_score(XGBest, train_features, train_labels, cv=5)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
'''

# Retraining models on the full training set
ENST_model = ENSTest.fit(train_features, train_labels)
'''
GB_model = GBest.fit(train_features, train_labels)
XGB_model = XGBest.fit(train_features, train_labels)
'''
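# If the boosted models above were re-enabled, a simple equal-weight blend in the
# spirit of the referenced stacked-regressions kernel (a sketch, with weights
# assumed rather than tuned) could replace the single-model prediction below:
# Final_labels = np.expm1((ENST_model.predict(test_features) +
#                          GB_model.predict(test_features)) / 2)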
## Getting our SalePrice estimation
# SalePrice was transformed with np.log1p, so np.expm1 is the exact inverse;
# plain np.exp would overestimate every price by one dollar
Final_labels = np.expm1(ENST_model.predict(test_features))
## Saving to CSV
pd.DataFrame({'Id': test.Id, 'SalePrice': Final_labels}).to_csv('submission-8.csv', index=False)