kaggle-1.py 9.4 KB

# -*- coding: utf-8 -*-
# This code is initially based on the Kaggle kernel from Sergei Neviadomski, which can be found at the following link:
# https://www.kaggle.com/neviadomski/how-to-get-to-top-25-with-simple-model-sklearn/notebook
# and on the Kaggle kernel from Pedro Marcelino, which can be found at the link below:
# https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python/notebook
# Importing needed libraries and reading data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble, linear_model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.utils import shuffle
import warnings
warnings.filterwarnings('ignore')

train = pd.read_csv("../../train.csv")
test = pd.read_csv("../../test.csv")
train.head()
# Checking for missing data: show every variable with at least one missing value in the train set
total_missing_data = train.isnull().sum().sort_values(ascending=False)
missing_data_percent = (train.isnull().sum() / train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total_missing_data, missing_data_percent], axis=1, keys=['Total', 'Percent'])
print(missing_data[missing_data['Percent'] > 0])
'''
# Let's get rid of the missing data
train = train.drop(missing_data[missing_data['Total'] > 0].index, axis=1)
'''
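## The same missing-value ratios can be read off in one line (a sketch using
## plain pandas, equivalent to the Percent column above):
# print(train.isna().mean().sort_values(ascending=False).head(20))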
# Prints R2, RMSE and RMSLE scores
def get_score(prediction, labels):
    # r2_score expects the ground truth first, then the prediction
    print('R2: {}'.format(r2_score(labels, prediction)))
    print('RMSE: {}'.format(np.sqrt(mean_squared_error(labels, prediction))))
    print('RMSLE: {}'.format(np.sqrt(np.square(np.log(prediction + 1) - np.log(labels + 1)).mean())))
# Shows scores for train and validation sets
def train_test(estimator, x_trn, x_tst, y_trn, y_tst):
    prediction_train = estimator.predict(x_trn)
    # Printing estimator
    print(estimator)
    # Printing train scores
    print("Train")
    get_score(prediction_train, y_trn)
    prediction_test = estimator.predict(x_tst)
    # Printing validation scores
    print("Test")
    get_score(prediction_test, y_tst)
# Splitting to features and labels and deleting variables I don't need
train_labels = train.pop('SalePrice')
# Test set does not even have a 'SalePrice' column, so both sets can be concatenated
features = pd.concat([train, test], keys=['train', 'test'])
# I decided to get rid of features that have more than half of their values missing or do not correlate with SalePrice
features.drop(['Utilities', 'RoofMatl', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'Heating', 'LowQualFinSF',
               'BsmtFullBath', 'BsmtHalfBath', 'Functional', 'GarageYrBlt', 'GarageArea', 'GarageCond', 'WoodDeckSF',
               'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal'],
              axis=1, inplace=True)
# MSSubClass as str
features['MSSubClass'] = features['MSSubClass'].astype(str)

# MSZoning NA only in the test set. Filling with the most frequent value
features['MSZoning'] = features['MSZoning'].fillna(features['MSZoning'].mode()[0])

# LotFrontage NA in both sets. Filling with the mean
features['LotFrontage'] = features['LotFrontage'].fillna(features['LotFrontage'].mean())

# Alley NA in both sets. NA means no alley access
features['Alley'] = features['Alley'].fillna('NOACCESS')

# Converting OverallCond to str
features['OverallCond'] = features['OverallCond'].astype(str)

# MasVnrType NA in both sets. Filling with the most frequent value
features['MasVnrType'] = features['MasVnrType'].fillna(features['MasVnrType'].mode()[0])

# BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2
# NA in both sets. NA means no basement
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    features[col] = features[col].fillna('NoBSMT')

# TotalBsmtSF NA only in the test set. I suppose NA means 0
features['TotalBsmtSF'] = features['TotalBsmtSF'].fillna(0)

# Electrical NA only in the train set. Filling with the most frequent value
features['Electrical'] = features['Electrical'].fillna(features['Electrical'].mode()[0])

# KitchenAbvGr to categorical
features['KitchenAbvGr'] = features['KitchenAbvGr'].astype(str)

# KitchenQual NA only in the test set. Filling with the most frequent value
features['KitchenQual'] = features['KitchenQual'].fillna(features['KitchenQual'].mode()[0])

# FireplaceQu NA in both sets. NA means no fireplace
features['FireplaceQu'] = features['FireplaceQu'].fillna('NoFP')

# GarageType, GarageFinish, GarageQual NA in both sets. NA means no garage
for col in ('GarageType', 'GarageFinish', 'GarageQual'):
    features[col] = features[col].fillna('NoGRG')

# GarageCars NA only in the test set. I suppose NA means 0
features['GarageCars'] = features['GarageCars'].fillna(0.0)

# SaleType NA only in the test set. Filling with the most frequent value
features['SaleType'] = features['SaleType'].fillna(features['SaleType'].mode()[0])

# Year and Month to categorical
features['YrSold'] = features['YrSold'].astype(str)
features['MoSold'] = features['MoSold'].astype(str)
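## Sanity check (a sketch): list whatever missing values are still left at this
## point (e.g. Exterior1st/Exterior2nd are handled further down):
# remaining = features.isnull().sum()
# print(remaining[remaining > 0])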
# Adding a total square-footage feature and removing Basement, 1st and 2nd floor features
features['TotalSF'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
features.drop(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], axis=1, inplace=True)
# Our SalePrice is skewed right (check the plot below), so I'm log-transforming it
plt.figure(1)
plt.clf()
ax = sns.distplot(train_labels)  # distplot is deprecated in newer seaborn; histplot(..., kde=True) is the modern equivalent
#plt.show()

## Log transformation of labels
train_labels = np.log(train_labels)

## Now it looks much better
plt.figure(2)
plt.clf()
ax = sns.distplot(train_labels)
#plt.show()
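## A quick numeric check of the transform (a sketch; assumes scipy is installed).
## The right skew should shrink to near zero after taking logs:
# from scipy.stats import skew
# print('Skew before log: {:.2f}'.format(skew(np.exp(train_labels))))
# print('Skew after log:  {:.2f}'.format(skew(train_labels)))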
## Standardizing numeric features
numeric_features = features.loc[:, ['LotFrontage', 'LotArea', 'GrLivArea', 'TotalSF']]
numeric_features_standardized = (numeric_features - numeric_features.mean()) / numeric_features.std()
#ax = sns.pairplot(numeric_features_standardized)
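## The same standardization can be done with sklearn (a sketch; equivalent up to
## the ddof used for the std: pandas defaults to ddof=1, StandardScaler uses ddof=0):
# from sklearn.preprocessing import StandardScaler
# scaled = StandardScaler().fit_transform(numeric_features)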
# Getting dummies from Condition1 and Condition2
conditions = set([x for x in features['Condition1']] + [x for x in features['Condition2']])
dummies = pd.DataFrame(data=np.zeros((len(features.index), len(conditions))),
                       index=features.index, columns=conditions)
for i, cond in enumerate(zip(features['Condition1'], features['Condition2'])):
    # .ix was removed from pandas; .loc with the row label does the same thing
    dummies.loc[dummies.index[i], list(cond)] = 1
features = pd.concat([features, dummies.add_prefix('Condition_')], axis=1)
features.drop(['Condition1', 'Condition2'], axis=1, inplace=True)
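## A vectorized alternative to the loop above (a sketch, not the original
## author's method; it would run before Condition1/Condition2 are dropped):
## add the two one-hot frames and cap overlapping categories at 1
# cond_dummies = (pd.get_dummies(features['Condition1'])
#                 .add(pd.get_dummies(features['Condition2']), fill_value=0)
#                 .clip(upper=1))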
# Getting dummies from Exterior1st and Exterior2nd
exteriors = set([x for x in features['Exterior1st']] + [x for x in features['Exterior2nd']])
dummies = pd.DataFrame(data=np.zeros((len(features.index), len(exteriors))),
                       index=features.index, columns=exteriors)
for i, ext in enumerate(zip(features['Exterior1st'], features['Exterior2nd'])):
    dummies.loc[dummies.index[i], list(ext)] = 1
features = pd.concat([features, dummies.add_prefix('Exterior_')], axis=1)
# The NaN column picked up from the missing Exterior values is dropped along with the originals
features.drop(['Exterior1st', 'Exterior2nd', 'Exterior_nan'], axis=1, inplace=True)
# Getting dummies from all other categorical vars
for col in features.dtypes[features.dtypes == 'object'].index:
    for_dummy = features.pop(col)
    features = pd.concat([features, pd.get_dummies(for_dummy, prefix=col)], axis=1)
### Copying features
features_standardized = features.copy()
### Replacing numeric features by standardized values
features_standardized.update(numeric_features_standardized)

### Splitting features
train_features = features.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
test_features = features.loc['test'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
### Splitting standardized features
train_features_st = features_standardized.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
test_features_st = features_standardized.loc['test'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
### Shuffling train sets
train_features_st, train_features, train_labels = shuffle(train_features_st, train_features, train_labels, random_state=5)

### Splitting into train/validation sets; the shared random_state keeps the raw
### and standardized splits aligned row for row
x_train, x_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.1, random_state=200)
x_train_st, x_test_st, y_train_st, y_test_st = train_test_split(train_features_st, train_labels, test_size=0.1, random_state=200)
'''
Elastic Net
'''
ENSTest = linear_model.ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10],
                                    l1_ratio=[.01, .1, .5, .9, .99], max_iter=5000).fit(x_train_st, y_train_st)
train_test(ENSTest, x_train_st, x_test_st, y_train_st, y_test_st)

# Average R2 score and standard deviation of 5-fold cross-validation
# (cross_val_score uses the estimator's default scorer, which is R2 for regressors)
scores = cross_val_score(ENSTest, train_features_st, train_labels, cv=5)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
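## The CV-selected hyperparameters can be read off the fitted estimator
## (alpha_ and l1_ratio_ are standard ElasticNetCV attributes):
# print('Chosen alpha: {}, l1_ratio: {}'.format(ENSTest.alpha_, ENSTest.l1_ratio_))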
'''
Gradient Boosting
'''
GBest = ensemble.GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=3, max_features='sqrt',
                                           min_samples_leaf=15, min_samples_split=10, loss='huber').fit(x_train, y_train)
train_test(GBest, x_train, x_test, y_train, y_test)

# Average R2 score and standard deviation of 5-fold cross-validation
# (cross-validating on the raw features the model was trained on, not the standardized ones)
scores = cross_val_score(GBest, train_features, train_labels, cv=5)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
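## With 3000 trees it is worth checking where boosting stops helping; staged_predict
## yields predictions after every stage (a sketch using the held-out split):
# test_errors = [mean_squared_error(y_test, pred) for pred in GBest.staged_predict(x_test)]
# print('Best iteration: {}'.format(np.argmin(test_errors) + 1))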
# Retraining models on the full training data
GB_model = GBest.fit(train_features, train_labels)
ENST_model = ENSTest.fit(train_features_st, train_labels)

## Getting our SalePrice estimation: predictions are in log space, so exponentiate
## back before averaging the two models
Final_labels = (np.exp(GB_model.predict(test_features)) + np.exp(ENST_model.predict(test_features_st))) / 2
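## Averaging in log space instead (a geometric mean of the two models in price
## space) is a common alternative for log-transformed targets:
# Final_labels = np.exp((GB_model.predict(test_features) + ENST_model.predict(test_features_st)) / 2)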
## Saving to CSV
pd.DataFrame({'Id': test.Id, 'SalePrice': Final_labels}).to_csv('submission-1.csv', index=False)