# -*- coding: utf-8 -*-
# This code is initially based on the Kaggle kernel from Sergei Neviadomski, which can be found in the following link
# https://www.kaggle.com/neviadomski/how-to-get-to-top-25-with-simple-model-sklearn/notebook
# and the Kaggle kernel from Pedro Marcelino, which can be found in the link below
# https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python/notebook

# Adding needed libraries and reading data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble, tree, linear_model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.utils import shuffle
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

train = pd.read_csv("../../train.csv")
test = pd.read_csv("../../test.csv")
# Prints R2, RMSE and RMSLE scores
def get_score(prediction, labels):
    # r2_score expects (y_true, y_pred), so labels go first
    print('R2: {}'.format(r2_score(labels, prediction)))
    print('RMSE: {}'.format(np.sqrt(mean_squared_error(labels, prediction))))
    print('RMSLE: {}'.format(np.sqrt(np.square(np.log(prediction + 1) - np.log(labels + 1)).mean())))
# Shows scores for train and validation sets
def train_test(estimator, x_trn, x_tst, y_trn, y_tst):
    prediction_train = estimator.predict(x_trn)
    # Printing estimator
    print(estimator)
    # Printing train scores
    get_score(prediction_train, y_trn)
    prediction_test = estimator.predict(x_tst)
    # Printing test scores
    print("Test")
    get_score(prediction_test, y_tst)
# Splitting into features and labels, and deleting variables I don't need
train_labels = train.pop('SalePrice')
# The test set does not have a 'SalePrice' column, so both sets can be concatenated
features = pd.concat([train, test], keys=['train', 'test'])
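# Sanity check (editor's addition): keys=['train', 'test'] builds a row MultiIndex, so each
# original frame can still be recovered with .loc['train'] / .loc['test'].
print(features.loc['train'].shape, features.loc['test'].shape)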
'''
# I decided to get rid of features that have more than half of missing information or do not correlate to SalePrice
features.drop(['Utilities', 'RoofMatl', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'Heating', 'LowQualFinSF',
               'BsmtFullBath', 'BsmtHalfBath', 'Functional', 'GarageYrBlt', 'GarageArea', 'GarageCond', 'WoodDeckSF',
               'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal'],
              axis=1, inplace=True)
'''
# Checking for missing data: show every variable with at least one missing value across the concatenated set
total_missing_data = features.isnull().sum().sort_values(ascending=False)
missing_data_percent = (features.isnull().sum() / features.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total_missing_data, missing_data_percent], axis=1, keys=['Total', 'Percent'])
print(missing_data[missing_data['Percent'] > 0])
# I get rid of the features that have a lot of missing data
features.drop(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'LotFrontage'], axis=1, inplace=True)
# Now I drop those features with duplicated information
features.drop(['GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageArea', 'GarageQual', 'GarageCond',
               '3SsnPorch', 'ScreenPorch', 'BsmtQual', 'BsmtCond', 'Heating', 'LandSlope', 'Exterior1st',
               'Exterior2nd', 'KitchenAbvGr', 'BedroomAbvGr', 'Fireplaces'], axis=1, inplace=True)
# Now the same for those features that seem non-related to SalePrice, or do not give much information about it
features.drop(['BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath',
               'BsmtHalfBath', 'BsmtUnfSF', 'Utilities', 'Street', 'MasVnrType', 'MasVnrArea'], axis=1, inplace=True)
# Doubts about this one, dropping it as well
features.drop(['RoofMatl'], axis=1, inplace=True)
# MSSubClass as str
features['MSSubClass'] = features['MSSubClass'].astype(str)
# MSZoning NA in the test set; filling with the most frequent value
features['MSZoning'] = features['MSZoning'].fillna(features['MSZoning'].mode()[0])
# Converting OverallCond to categorical
features.OverallCond = features.OverallCond.astype(str)
# TotalBsmtSF NA in the test set; I suppose NA means 0
features['TotalBsmtSF'] = features['TotalBsmtSF'].fillna(0)
# Electrical NA; filling with the most frequent value
features['Electrical'] = features['Electrical'].fillna(features['Electrical'].mode()[0])
# KitchenQual NA in the test set; filling with the most frequent value
features['KitchenQual'] = features['KitchenQual'].fillna(features['KitchenQual'].mode()[0])
# FireplaceQu NA in both sets; NA means no fireplace
features['FireplaceQu'] = features['FireplaceQu'].fillna('NoFP')
# GarageCars NA in the test set; I suppose NA means 0
features['GarageCars'] = features['GarageCars'].fillna(0.0)
# SaleType NA in the test set; filling with the most frequent value
features['SaleType'] = features['SaleType'].fillna(features['SaleType'].mode()[0])
# Year and Month to categorical
features['YrSold'] = features['YrSold'].astype(str)
features['MoSold'] = features['MoSold'].astype(str)
# Adding total square footage feature and removing Basement, 1st and 2nd floor features
features['TotalSF'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
features.drop(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], axis=1, inplace=True)
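# Quick check (editor's addition): the engineered TotalSF feature should correlate strongly
# with the raw SalePrice labels; the train slice and train_labels still share the same index here.
print('TotalSF vs SalePrice correlation: {:.3f}'.format(features.loc['train']['TotalSF'].corr(train_labels)))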
# Our SalePrice is skewed right (check plot below). I'm log-transforming it.
plt.figure(1)
plt.clf()
ax = sns.distplot(train_labels)
#plt.show()
## Log transformation of labels
train_labels = np.log(train_labels)
## Now it looks much better
plt.figure(2)
plt.clf()
ax = sns.distplot(train_labels)
#plt.show()
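# Optional check (editor's addition, assumes scipy is available): the skewness should drop
# close to zero after the log transform.
from scipy.stats import skew
print('Skewness of log(SalePrice): {:.3f}'.format(skew(train_labels)))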
## Standardizing numeric features
# Note: 'LotFrontage' was dropped above, so it is left out of this selection
numeric_features = features.loc[:, ['LotArea', 'GrLivArea', 'TotalSF']]
numeric_features_standardized = (numeric_features - numeric_features.mean()) / numeric_features.std()
#ax = sns.pairplot(numeric_features_standardized)
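# Sanity check (editor's addition): standardized columns should have mean ~0 and std ~1.
print(numeric_features_standardized.describe().loc[['mean', 'std']])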
# Getting Dummies from Condition1 and Condition2
# sorted() gives a deterministic column order (a plain set would be rejected as an
# unordered index by recent pandas)
conditions = sorted(set([x for x in features['Condition1']] + [x for x in features['Condition2']]))
dummies = pd.DataFrame(data=np.zeros((len(features.index), len(conditions))),
                       index=features.index, columns=conditions)
for i, cond in enumerate(zip(features['Condition1'], features['Condition2'])):
    # .ix has been removed from pandas; set the matching columns by label instead
    dummies.loc[dummies.index[i], list(set(cond))] = 1
features = pd.concat([features, dummies.add_prefix('Condition_')], axis=1)
features.drop(['Condition1', 'Condition2'], axis=1, inplace=True)
# Getting Dummies from all other categorical vars
for col in features.dtypes[features.dtypes == 'object'].index:
    for_dummy = features.pop(col)
    features = pd.concat([features, pd.get_dummies(for_dummy, prefix=col)], axis=1)
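# Quick check (editor's addition): after the loop, no object-typed columns should remain.
print('Remaining object columns:', features.select_dtypes(include=['object']).shape[1])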
### Copying features
features_standardized = features.copy()
### Replacing numeric features by standardized values
features_standardized.update(numeric_features_standardized)
### Splitting features
train_features = features.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
test_features = features.loc['test'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
### Splitting standardized features
train_features_st = features_standardized.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
test_features_st = features_standardized.loc['test'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
### Shuffling train sets
train_features_st, train_features, train_labels = shuffle(train_features_st, train_features, train_labels, random_state=5)
### Splitting
x_train, x_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.1, random_state=200)
x_train_st, x_test_st, y_train_st, y_test_st = train_test_split(train_features_st, train_labels, test_size=0.1, random_state=200)
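# Sanity check (editor's addition): both splits use the same random_state on arrays of the
# same length, so the held-out rows coincide and the label vectors should match exactly.
assert np.allclose(y_test, y_test_st)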
'''
Elastic Net
'''
ENSTest = linear_model.ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10],
                                    l1_ratio=[.01, .1, .5, .9, .99], max_iter=5000).fit(x_train_st, y_train_st)
train_test(ENSTest, x_train_st, x_test_st, y_train_st, y_test_st)
# Average R2 score and standard deviation of 5-fold cross-validation
scores = cross_val_score(ENSTest, train_features_st, train_labels, cv=5)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
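# Optional (editor's addition): inspect the hyper-parameters ElasticNetCV settled on.
print('Chosen alpha: {}, l1_ratio: {}'.format(ENSTest.alpha_, ENSTest.l1_ratio_))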
'''
Gradient Boosting
'''
GBest = ensemble.GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=3, max_features='sqrt',
                                           min_samples_leaf=15, min_samples_split=10, loss='huber').fit(x_train, y_train)
train_test(GBest, x_train, x_test, y_train, y_test)
# Average R2 score and standard deviation of 5-fold cross-validation
# (the original kernel cross-validates this tree model on the standardized matrix; trees are
# insensitive to monotonic feature scaling, so it makes little practical difference)
scores = cross_val_score(GBest, train_features_st, train_labels, cv=5)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
'''
XGBoost
'''
XGBest = xgb.XGBRegressor(max_depth=3, learning_rate=0.05, n_estimators=3000).fit(x_train, y_train)
train_test(XGBest, x_train, x_test, y_train, y_test)
# Average R2 score and standard deviation of 5-fold cross-validation
scores = cross_val_score(XGBest, train_features_st, train_labels, cv=5)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
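# Optional variant (editor's assumption, not in the original kernel): early stopping can pick
# the number of boosting rounds instead of a hand-tuned n_estimators. Depending on the
# xgboost version, early_stopping_rounds goes either in fit() or in the constructor.
# XGBest = xgb.XGBRegressor(max_depth=3, learning_rate=0.05, n_estimators=3000).fit(
#     x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=50, verbose=False)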
# Retraining models on the full training set
GB_model = GBest.fit(train_features, train_labels)
ENST_model = ENSTest.fit(train_features_st, train_labels)
XGB_model = XGBest.fit(train_features, train_labels)
## Getting our SalePrice estimation: undo the log transform and average the three models
Final_labels = (np.exp(GB_model.predict(test_features)) + np.exp(ENST_model.predict(test_features_st))
                + np.exp(XGB_model.predict(test_features))) / 3
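# Alternative blend (editor's assumption, not what the kernel does): averaging in log space,
# i.e. a geometric mean of the three price predictions, is another common choice.
# Final_labels = np.exp((GB_model.predict(test_features) + ENST_model.predict(test_features_st)
#                        + XGB_model.predict(test_features)) / 3)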
## Saving to CSV
pd.DataFrame({'Id': test.Id, 'SalePrice': Final_labels}).to_csv('submission-4.csv', index=False)