# -*- coding: utf-8 -*-
# This code is initially based on the Kaggle kernel from Sergei Neviadomski, which can be found in the following link
# https://www.kaggle.com/neviadomski/how-to-get-to-top-25-with-simple-model-sklearn/notebook
# and the Kaggle kernel from Pedro Marcelino, which can be found in the link below
# https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python/notebook
# Adding needed libraries and reading data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble, tree, linear_model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.utils import shuffle
from scipy import stats
from scipy.stats import norm, skew
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')
train = pd.read_csv("../../train.csv")
test = pd.read_csv("../../test.csv")
# Prints R2, RMSE and RMSLE scores
def get_score(prediction, labels):
    print('R2: {}'.format(r2_score(prediction, labels)))
    print('RMSE: {}'.format(np.sqrt(mean_squared_error(prediction, labels))))
    print('RMSLE: {}'.format(np.sqrt(np.square(np.log1p(prediction) - np.log1p(labels)).mean())))
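# Notes (not in the original kernel): sklearn's r2_score expects (y_true, y_pred),
# so passing the prediction first flips the convention and changes the reported R2.
# Also, SalePrice is log1p-transformed below, which makes the RMSE here an RMSLE
# on the original prices; the RMSLE line then applies a second log on top.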
# Shows scores for train and validation sets
def train_test(estimator, x_trn, x_tst, y_trn, y_tst):
    prediction_train = estimator.predict(x_trn)
    # Printing estimator
    print(estimator)
    # Printing train scores
    get_score(prediction_train, y_trn)
    prediction_test = estimator.predict(x_tst)
    # Printing validation scores
    print("Test")
    get_score(prediction_test, y_tst)
# Deleting outliers
train = train.drop(train[(train['GrLivArea']>4000) & (train['SalePrice']<300000)].index)
# We use the numpy function log1p, which applies log(1+x) to all elements of the column
train["SalePrice"] = np.log1p(train["SalePrice"])
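# Quick sanity check (not in the original kernel): log1p should leave SalePrice
# much less right-skewed; skew comes from the scipy.stats import above.
print("Skewness of SalePrice after log1p: {:.2f}".format(skew(train['SalePrice'])))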
# Check the new distribution (sns.distplot is deprecated in newer seaborn; histplot is the modern equivalent)
sns.distplot(train['SalePrice'], fit=norm)
# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Now plot the distribution
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')
# Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()
# Splitting to features and labels and deleting variables I don't need
train_labels = train.pop('SalePrice')
# Test set does not have a 'SalePrice' column, so both sets can be concatenated
features = pd.concat([train, test], keys=['train', 'test'])
'''
# I decided to get rid of features that have more than half of missing information or do not correlate to SalePrice
features.drop(['Utilities', 'RoofMatl', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'Heating', 'LowQualFinSF',
               'BsmtFullBath', 'BsmtHalfBath', 'Functional', 'GarageYrBlt', 'GarageArea', 'GarageCond', 'WoodDeckSF',
               'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal'],
              axis=1, inplace=True)
'''
features.drop(['Utilities'], axis=1, inplace=True)
# Checking for missing data, showing every variable with at least one missing value
# (percentages are computed over the combined train+test set)
total_missing_data = features.isnull().sum().sort_values(ascending=False)
missing_data_percent = (features.isnull().sum()/features.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total_missing_data, missing_data_percent], axis=1, keys=['Total', 'Percent'])
print(missing_data[missing_data['Percent'] > 0])
# Converting OverallCond and OverallQual to str
features.OverallCond = features.OverallCond.astype(str)
features.OverallQual = features.OverallQual.astype(str)
# MSSubClass as str
features['MSSubClass'] = features['MSSubClass'].astype(str)
# MSZoning NA in pred. Filling with the most popular value
features['MSZoning'] = features['MSZoning'].fillna(features['MSZoning'].mode()[0])
# LotFrontage NA filling with the median according to its OverallQual value
median = features.groupby('OverallQual')['LotFrontage'].transform('median')
features['LotFrontage'] = features['LotFrontage'].fillna(median)
# Alley NA in all. NA means no access
features['Alley'] = features['Alley'].fillna('NOACCESS')
# MasVnrArea NA filling with the median according to its OverallQual value
median = features.groupby('OverallQual')['MasVnrArea'].transform('median')
features['MasVnrArea'] = features['MasVnrArea'].fillna(median)
# MasVnrType NA in all. Filling with the most popular value
features['MasVnrType'] = features['MasVnrType'].fillna(features['MasVnrType'].mode()[0])
# BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2
# NA in all. NA means no basement
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    features[col] = features[col].fillna('NoBSMT')
# TotalBsmtSF NA in pred. I suppose NA means 0 (also covered by the Bsmt SF loop below)
features['TotalBsmtSF'] = features['TotalBsmtSF'].fillna(0)
# Electrical NA in pred. Filling with the most popular value
features['Electrical'] = features['Electrical'].fillna(features['Electrical'].mode()[0])
# KitchenAbvGr to categorical
features['KitchenAbvGr'] = features['KitchenAbvGr'].astype(str)
# KitchenQual NA in pred. Filling with the most popular value
features['KitchenQual'] = features['KitchenQual'].fillna(features['KitchenQual'].mode()[0])
# FireplaceQu NA in all. NA means no fireplace
features['FireplaceQu'] = features['FireplaceQu'].fillna('NoFP')
# Garage-like features NA in all. NA means no garage
# (note: filling numeric GarageYrBlt with a string makes it an object column,
# so it is one-hot encoded with the other categoricals later)
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageYrBlt', 'GarageCond'):
    features[col] = features[col].fillna('NoGRG')
# GarageCars and GarageArea NA in pred. I suppose NA means 0
for col in ('GarageCars', 'GarageArea'):
    features[col] = features[col].fillna(0.0)
# SaleType NA in pred. Filling with the most popular value
features['SaleType'] = features['SaleType'].fillna(features['SaleType'].mode()[0])
# PoolQC NA in all. NA means no pool
features['PoolQC'] = features['PoolQC'].fillna('NoPool')
# MiscFeature NA in all. NA means none
features['MiscFeature'] = features['MiscFeature'].fillna('None')
# Fence NA in all. NA means no fence
features['Fence'] = features['Fence'].fillna('NoFence')
# BsmtHalfBath and BsmtFullBath NA means 0
for col in ('BsmtHalfBath', 'BsmtFullBath'):
    features[col] = features[col].fillna(0)
# Functional NA means Typ
features['Functional'] = features['Functional'].fillna('Typ')
# NA in Bsmt SF variables means not that type of basement: 0 square feet
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF'):
    features[col] = features[col].fillna(0)
# NA in Exterior1st and Exterior2nd filled with the most common value
for col in ('Exterior1st', 'Exterior2nd'):
    features[col] = features[col].fillna(features[col].mode()[0])
# Year and month to categorical
features['YrSold'] = features['YrSold'].astype(str)
features['MoSold'] = features['MoSold'].astype(str)
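# Casting YrSold/MoSold to str means the get_dummies pass below one-hot encodes
# sale year and month instead of treating them as ordinal numbers.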
# Adding a total square-footage feature and removing the Basement, 1st and 2nd floor features
features['TotalSF'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
features.drop(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], axis=1, inplace=True)
## Standardizing numeric features
numeric_features = features.loc[:, ['LotFrontage', 'LotArea', 'GrLivArea', 'TotalSF']]
numeric_features_standardized = (numeric_features - numeric_features.mean())/numeric_features.std()
# ax = sns.pairplot(numeric_features_standardized)
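# A near-equivalent sklearn alternative (a sketch, not from the original kernel;
# StandardScaler divides by the population std, ddof=0, while pandas .std()
# uses ddof=1, so the values differ very slightly):
# from sklearn.preprocessing import StandardScaler
# numeric_features_standardized = pd.DataFrame(
#     StandardScaler().fit_transform(numeric_features),
#     index=numeric_features.index, columns=numeric_features.columns)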
# Getting dummies from Condition1 and Condition2
conditions = sorted(set(features['Condition1']) | set(features['Condition2']))
dummies = pd.DataFrame(data=np.zeros((len(features.index), len(conditions))),
                       index=features.index, columns=conditions)
for idx, cond in zip(dummies.index, zip(features['Condition1'], features['Condition2'])):
    # .ix was removed from pandas; .loc with the row label does the same job here
    dummies.loc[idx, list(set(cond))] = 1
features = pd.concat([features, dummies.add_prefix('Condition_')], axis=1)
features.drop(['Condition1', 'Condition2'], axis=1, inplace=True)
# Getting dummies from Exterior1st and Exterior2nd
exteriors = sorted(set(features['Exterior1st']) | set(features['Exterior2nd']))
dummies = pd.DataFrame(data=np.zeros((len(features.index), len(exteriors))),
                       index=features.index, columns=exteriors)
for idx, ext in zip(dummies.index, zip(features['Exterior1st'], features['Exterior2nd'])):
    dummies.loc[idx, list(set(ext))] = 1
features = pd.concat([features, dummies.add_prefix('Exterior_')], axis=1)
features.drop(['Exterior1st', 'Exterior2nd'], axis=1, inplace=True)
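# The loops above could also be replaced by a vectorized version (a sketch, not
# from the original kernel; the per-column dummies must be aligned to the same
# column set before combining), e.g. for the conditions:
# c1 = pd.get_dummies(features['Condition1']).reindex(columns=conditions, fill_value=0)
# c2 = pd.get_dummies(features['Condition2']).reindex(columns=conditions, fill_value=0)
# cond_dummies = (c1 + c2).clip(upper=1)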
# Getting dummies from all other categorical vars
for col in features.dtypes[features.dtypes == 'object'].index:
    for_dummy = features.pop(col)
    features = pd.concat([features, pd.get_dummies(for_dummy, prefix=col)], axis=1)
### Copying features
features_standardized = features.copy()
### Replacing numeric features by standardized values
features_standardized.update(numeric_features_standardized)
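# DataFrame.update aligns on index and columns, so only the four standardized
# numeric columns are overwritten; all other columns are left untouched.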
### Splitting features
train_features = features.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
test_features = features.loc['test'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
### Splitting standardized features
train_features_st = features_standardized.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
test_features_st = features_standardized.loc['test'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
### Shuffling train sets
train_features_st, train_features, train_labels = shuffle(train_features_st, train_features, train_labels, random_state=5)
### Splitting
x_train, x_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.1, random_state=200)
x_train_st, x_test_st, y_train_st, y_test_st = train_test_split(train_features_st, train_labels, test_size=0.1, random_state=200)
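# Note: both splits use the same test_size and random_state on arrays that were
# shuffled together, so they select the same rows (y_test equals y_test_st).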
'''
Elastic Net
'''
ENSTest = linear_model.ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10], l1_ratio=[.01, .1, .5, .9, .99], max_iter=5000).fit(x_train_st, y_train_st)
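# (ElasticNetCV has already cross-validated the alpha and l1_ratio grids above
# internally and refit on x_train_st with the best pair.)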
train_test(ENSTest, x_train_st, x_test_st, y_train_st, y_test_st)
# Average R2 score and standard deviation of 5-fold cross-validation
scores = cross_val_score(ENSTest, train_features_st, train_labels, cv=5)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
'''
Gradient Boosting
'''
GBest = ensemble.GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=3, max_features='sqrt',
                                           min_samples_leaf=15, min_samples_split=10, loss='huber').fit(x_train, y_train)
train_test(GBest, x_train, x_test, y_train, y_test)
# Average R2 score and standard deviation of 5-fold cross-validation
# (non-standardized features, matching how this model was fit)
scores = cross_val_score(GBest, train_features, train_labels, cv=5)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
'''
XGBoost
'''
XGBest = xgb.XGBRegressor(max_depth=3, learning_rate=0.05, n_estimators=3000).fit(x_train, y_train)
train_test(XGBest, x_train, x_test, y_train, y_test)
# Average R2 score and standard deviation of 5-fold cross-validation
# (non-standardized features, matching how this model was fit)
scores = cross_val_score(XGBest, train_features, train_labels, cv=5)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# Retraining models on the full training set
GB_model = GBest.fit(train_features, train_labels)
ENST_model = ENSTest.fit(train_features_st, train_labels)
XGB_model = XGBest.fit(train_features, train_labels)
## Getting our SalePrice estimation: predictions are in log1p space, so expm1
## (the inverse of log1p) maps them back to prices before averaging the models
Final_labels = (np.expm1(GB_model.predict(test_features)) + np.expm1(ENST_model.predict(test_features_st))
                + np.expm1(XGB_model.predict(test_features))) / 3
## Saving to CSV
pd.DataFrame({'Id': test.Id, 'SalePrice': Final_labels}).to_csv('submission-6.csv', index=False)