# -*- coding: utf-8 -*-
# This code is initially based on the Kaggle kernel from Sergei Neviadomski, which can be found in the following link
# https://www.kaggle.com/neviadomski/how-to-get-to-top-25-with-simple-model-sklearn/notebook
# and the Kaggle kernel from Pedro Marcelino, which can be found in the link below
# https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python/notebook
# Adding needed libraries and reading data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble, tree, linear_model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.utils import shuffle
from scipy import stats
from scipy.stats import norm, skew
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

train = pd.read_csv("../../train.csv")
test = pd.read_csv("../../test.csv")
# Prints R2, RMSE and RMSLE scores
def get_score(prediction, labels):
    print('R2: {}'.format(r2_score(prediction, labels)))
    print('RMSE: {}'.format(np.sqrt(mean_squared_error(prediction, labels))))
    print('RMSLE: {}'.format(np.sqrt(np.square(np.log(prediction + 1) - np.log(labels + 1)).mean())))
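# Note: SalePrice is log1p-transformed below, so the RMSE reported on that scale
# already approximates the competition's RMSLE; the RMSLE line above then applies
# a second log1p on top of the already log-scaled values.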
# Shows scores for train and validation sets
def train_test(estimator, x_trn, x_tst, y_trn, y_tst):
    prediction_train = estimator.predict(x_trn)
    # Printing estimator
    print(estimator)
    # Printing train scores
    get_score(prediction_train, y_trn)
    prediction_test = estimator.predict(x_tst)
    # Printing test scores
    print("Test")
    get_score(prediction_test, y_tst)
# Deleting outliers
train = train.drop(train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)].index)
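# Optional sanity check (not part of the original kernels): confirm how many rows
# the outlier filter above removed from the raw training set.
# print(train.shape)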
# We use the numpy function log1p, which applies log(1+x) to all elements of the column
train["SalePrice"] = np.log1p(train["SalePrice"])
# Check the new distribution
sns.distplot(train['SalePrice'], fit=norm)
# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Now plot the distribution
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')
# Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()
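# Because SalePrice is modelled on the log1p scale, predictions have to be mapped
# back with expm1, its exact inverse (see the final submission step below).
# A minimal round-trip check:
assert np.isclose(np.expm1(np.log1p(200000.0)), 200000.0)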
# Splitting into features and labels and deleting variables I don't need
train_labels = train.pop('SalePrice')
# Test set does not even have a 'SalePrice' column, so both sets can be concatenated
features = pd.concat([train, test], keys=['train', 'test'])
# I decided to get rid of features that have more than half of their values missing or that do not correlate with SalePrice
features.drop(['Utilities', 'RoofMatl', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'Heating', 'LowQualFinSF',
               'BsmtFullBath', 'BsmtHalfBath', 'Functional', 'GarageYrBlt', 'GarageArea', 'GarageCond', 'WoodDeckSF',
               'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal'],
              axis=1, inplace=True)
# Checking for missing data, showing every variable with at least one missing value in the combined set
total_missing_data = features.isnull().sum().sort_values(ascending=False)
missing_data_percent = (features.isnull().sum() / features.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total_missing_data, missing_data_percent], axis=1, keys=['Total', 'Percent'])
print(missing_data[missing_data['Percent'] > 0])
# Converting OverallCond and OverallQual to str
features.OverallCond = features.OverallCond.astype(str)
features.OverallQual = features.OverallQual.astype(str)
# MSSubClass as str
features['MSSubClass'] = features['MSSubClass'].astype(str)
# MSZoning: NA in the test set; fill with the most frequent value
features['MSZoning'] = features['MSZoning'].fillna(features['MSZoning'].mode()[0])
# LotFrontage: fill NA with the median LotFrontage of houses with the same OverallQual value
median = features.groupby('OverallQual')['LotFrontage'].transform('median')
features['LotFrontage'] = features['LotFrontage'].fillna(median)
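# groupby(...).transform('median') returns a Series aligned with `features`, so each
# missing LotFrontage value is filled with the median frontage of houses that share
# its OverallQual value.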
# Alley: NA in both sets; NA means no alley access
features['Alley'] = features['Alley'].fillna('NOACCESS')
# MasVnrType: NA in both sets; fill with the most frequent value
features['MasVnrType'] = features['MasVnrType'].fillna(features['MasVnrType'].mode()[0])
# BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2:
# NA in both sets; NA means no basement
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    features[col] = features[col].fillna('NoBSMT')
# TotalBsmtSF: NA in the test set; I suppose NA means 0
features['TotalBsmtSF'] = features['TotalBsmtSF'].fillna(0)
# Electrical: fill NA with the most frequent value
features['Electrical'] = features['Electrical'].fillna(features['Electrical'].mode()[0])
# KitchenAbvGr to categorical
features['KitchenAbvGr'] = features['KitchenAbvGr'].astype(str)
# KitchenQual: NA in the test set; fill with the most frequent value
features['KitchenQual'] = features['KitchenQual'].fillna(features['KitchenQual'].mode()[0])
# FireplaceQu: NA in both sets; NA means no fireplace
features['FireplaceQu'] = features['FireplaceQu'].fillna('NoFP')
# GarageType, GarageFinish, GarageQual: NA in both sets; NA means no garage
for col in ('GarageType', 'GarageFinish', 'GarageQual'):
    features[col] = features[col].fillna('NoGRG')
# GarageCars: NA in the test set; I suppose NA means 0
features['GarageCars'] = features['GarageCars'].fillna(0.0)
# SaleType: NA in the test set; fill with the most frequent value
features['SaleType'] = features['SaleType'].fillna(features['SaleType'].mode()[0])
# Year and Month to categorical
features['YrSold'] = features['YrSold'].astype(str)
features['MoSold'] = features['MoSold'].astype(str)
# Adding a total square-footage feature and removing the basement, 1st and 2nd floor features
features['TotalSF'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
features.drop(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], axis=1, inplace=True)
## Standardizing numeric features
numeric_features = features.loc[:, ['LotFrontage', 'LotArea', 'GrLivArea', 'TotalSF']]
numeric_features_standardized = (numeric_features - numeric_features.mean()) / numeric_features.std()
#ax = sns.pairplot(numeric_features_standardized)
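# The same standardization could also be done with sklearn's StandardScaler
# (sketch only, not used below; note that StandardScaler divides by the population
# std while pandas .std() uses the sample std, and that fitting on the combined
# train+test frame leaks a small amount of test-set information either way):
# from sklearn.preprocessing import StandardScaler
# numeric_features_standardized = pd.DataFrame(
#     StandardScaler().fit_transform(numeric_features),
#     index=numeric_features.index, columns=numeric_features.columns)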
# Getting dummies from Condition1 and Condition2
conditions = set([x for x in features['Condition1']] + [x for x in features['Condition2']])
dummies = pd.DataFrame(data=np.zeros((len(features.index), len(conditions))),
                       index=features.index, columns=conditions)
for i, cond in enumerate(zip(features['Condition1'], features['Condition2'])):
    # .ix has been removed from pandas; select the row by position and the columns by label instead
    dummies.iloc[i, dummies.columns.get_indexer(list(cond))] = 1
features = pd.concat([features, dummies.add_prefix('Condition_')], axis=1)
features.drop(['Condition1', 'Condition2'], axis=1, inplace=True)
# Getting dummies from Exterior1st and Exterior2nd
exteriors = set([x for x in features['Exterior1st']] + [x for x in features['Exterior2nd']])
dummies = pd.DataFrame(data=np.zeros((len(features.index), len(exteriors))),
                       index=features.index, columns=exteriors)
for i, ext in enumerate(zip(features['Exterior1st'], features['Exterior2nd'])):
    dummies.iloc[i, dummies.columns.get_indexer(list(ext))] = 1
features = pd.concat([features, dummies.add_prefix('Exterior_')], axis=1)
features.drop(['Exterior1st', 'Exterior2nd', 'Exterior_nan'], axis=1, inplace=True)
# Getting dummies from all other categorical vars
for col in features.dtypes[features.dtypes == 'object'].index:
    for_dummy = features.pop(col)
    features = pd.concat([features, pd.get_dummies(for_dummy, prefix=col)], axis=1)
### Copying features
features_standardized = features.copy()
### Replacing numeric features by standardized values
features_standardized.update(numeric_features_standardized)
### Splitting features
train_features = features.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
test_features = features.loc['test'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
### Splitting standardized features
train_features_st = features_standardized.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
test_features_st = features_standardized.loc['test'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
### Shuffling train sets
train_features_st, train_features, train_labels = shuffle(train_features_st, train_features, train_labels, random_state=5)
### Splitting
x_train, x_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.1, random_state=200)
x_train_st, x_test_st, y_train_st, y_test_st = train_test_split(train_features_st, train_labels, test_size=0.1, random_state=200)
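# Both calls above use the same test_size and random_state, so the raw and
# standardized splits stay aligned row-for-row (y_train and y_train_st are identical).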
'''
Elastic Net
'''
ENSTest = linear_model.ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10],
                                    l1_ratio=[.01, .1, .5, .9, .99], max_iter=5000).fit(x_train_st, y_train_st)
train_test(ENSTest, x_train_st, x_test_st, y_train_st, y_test_st)
# Average R2 score and standard deviation of 5-fold cross-validation
scores = cross_val_score(ENSTest, train_features_st, train_labels, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
'''
Gradient Boosting
'''
GBest = ensemble.GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=3, max_features='sqrt',
                                           min_samples_leaf=15, min_samples_split=10, loss='huber').fit(x_train, y_train)
train_test(GBest, x_train, x_test, y_train, y_test)
# Average R2 score and standard deviation of 5-fold cross-validation
scores = cross_val_score(GBest, train_features_st, train_labels, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
'''
XGBoost
'''
XGBest = xgb.XGBRegressor(max_depth=3, learning_rate=0.05, n_estimators=3000).fit(x_train, y_train)
train_test(XGBest, x_train, x_test, y_train, y_test)
# Average R2 score and standard deviation of 5-fold cross-validation
scores = cross_val_score(XGBest, train_features_st, train_labels, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# Retraining models on the full training set
GB_model = GBest.fit(train_features, train_labels)
ENST_model = ENSTest.fit(train_features_st, train_labels)
XGB_model = XGBest.fit(train_features, train_labels)
## Getting our SalePrice estimation
# expm1 is the exact inverse of the log1p transform applied to SalePrice above
Final_labels = (np.expm1(GB_model.predict(test_features)) + np.expm1(ENST_model.predict(test_features_st))
                + np.expm1(XGB_model.predict(test_features))) / 3
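# The three models are blended with a simple unweighted average on the price scale.
# A weighted blend is a common variation (sketch only; the weights below are purely
# illustrative and would need to be tuned, e.g. on the validation split):
# Final_labels = (0.4 * np.expm1(GB_model.predict(test_features))
#                 + 0.3 * np.expm1(ENST_model.predict(test_features_st))
#                 + 0.3 * np.expm1(XGB_model.predict(test_features)))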
## Saving to CSV
pd.DataFrame({'Id': test.Id, 'SalePrice': Final_labels}).to_csv('submission-5.csv', index=False)