# -*- coding: utf-8 -*-
# This code is initially based on the Kaggle kernel from Sergei Neviadomski, which can be found in the following link
# https://www.kaggle.com/neviadomski/how-to-get-to-top-25-with-simple-model-sklearn/notebook
# and the Kaggle kernel from Pedro Marcelino, which can be found in the link below
# https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python/notebook
# Also, part of the preprocessing has been inspired by this kernel from Serigne
# https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard

# Adding needed libraries and reading data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble, tree, linear_model, preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.utils import shuffle
from scipy import stats
from scipy.stats import norm, skew, boxcox
from scipy.special import boxcox1p
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

train = pd.read_csv("../../train.csv")
test = pd.read_csv("../../test.csv")

# Prints R2, RMSE and RMSLE scores. Note that SalePrice is log1p-transformed
# below, so the RMSE printed here is already the competition's RMSLE on raw prices.
def get_score(prediction, labels):
    print('R2: {}'.format(r2_score(prediction, labels)))
    print('RMSE: {}'.format(np.sqrt(mean_squared_error(prediction, labels))))
    print('RMSLE: {}'.format(np.sqrt(np.square(np.log(prediction + 1) - np.log(labels + 1)).mean())))

# Shows scores for train and validation sets
def train_test(estimator, x_trn, x_tst, y_trn, y_tst):
    prediction_train = estimator.predict(x_trn)
    # Printing estimator
    print(estimator)
    # Printing train scores
    get_score(prediction_train, y_trn)
    prediction_test = estimator.predict(x_tst)
    # Printing validation scores
    print("Test")
    get_score(prediction_test, y_tst)

# Deleting outliers: very large houses that sold unusually cheaply
train = train.drop(train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)].index)

# We use the numpy function log1p, which applies log(1+x) to all elements of the column
train["SalePrice"] = np.log1p(train["SalePrice"])

# Check the new distribution (distplot is deprecated in newer seaborn;
# histplot/displot are its replacements)
sns.distplot(train['SalePrice'], fit=norm)

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Now plot the distribution
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
#plt.show()
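# Added sanity check, not in the original kernels: log1p should pull the
# right-skewed target close to symmetric. skew comes from scipy.stats
# (imported above); np.expm1 recovers the raw prices for comparison.
print('Skewness of raw SalePrice: {:.2f}'.format(skew(np.expm1(train['SalePrice']))))
print('Skewness after log1p     : {:.2f}'.format(skew(train['SalePrice'])))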
# Splitting to features and labels and deleting variables I don't need
train_labels = train.pop('SalePrice')

# Test set does not even have a 'SalePrice' column, so both sets can be concatenated
features = pd.concat([train, test], keys=['train', 'test'])
features.drop(['Utilities'], axis=1, inplace=True)

# Checking for missing data, showing every variable with at least one missing value in the combined set
total_missing_data = features.isnull().sum().sort_values(ascending=False)
missing_data_percent = (features.isnull().sum() / features.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total_missing_data, missing_data_percent], axis=1, keys=['Total', 'Percent'])
print(missing_data[missing_data['Percent'] > 0])

# Converting OverallCond and OverallQual to str
features.OverallCond = features.OverallCond.astype(str)
features.OverallQual = features.OverallQual.astype(str)

# MSSubClass as str
features['MSSubClass'] = features['MSSubClass'].astype(str)

# MSZoning NA in pred. Filling with most popular values
features['MSZoning'] = features['MSZoning'].fillna(features['MSZoning'].mode()[0])

# LotFrontage NA filling with median according to its OverallQual value
median = features.groupby('OverallQual')['LotFrontage'].transform('median')
features['LotFrontage'] = features['LotFrontage'].fillna(median)

# Alley NA in all. NA means no access
features['Alley'] = features['Alley'].fillna('NOACCESS')

# MasVnrArea NA filling with median according to its OverallQual value
median = features.groupby('OverallQual')['MasVnrArea'].transform('median')
features['MasVnrArea'] = features['MasVnrArea'].fillna(median)
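# How the two groupby/transform fills above work (illustrative toy data, not
# the housing set): transform('median') returns a Series aligned with the
# original index, so fillna replaces each NaN with its own group's median.
_toy = pd.DataFrame({'grp': ['a', 'a', 'b', 'b'], 'val': [1.0, np.nan, 10.0, 30.0]})
_toy['val'] = _toy['val'].fillna(_toy.groupby('grp')['val'].transform('median'))
# _toy['val'] is now [1.0, 1.0, 10.0, 30.0]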
# MasVnrType NA in all. Filling with most popular values
features['MasVnrType'] = features['MasVnrType'].fillna(features['MasVnrType'].mode()[0])

# BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2
# NA in all. NA means No basement
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    features[col] = features[col].fillna('NoBSMT')

# TotalBsmtSF NA in pred. I suppose NA means 0
features['TotalBsmtSF'] = features['TotalBsmtSF'].fillna(0)

# Electrical NA in pred. Filling with most popular values
features['Electrical'] = features['Electrical'].fillna(features['Electrical'].mode()[0])

# KitchenAbvGr to categorical
features['KitchenAbvGr'] = features['KitchenAbvGr'].astype(str)

# KitchenQual NA in pred. Filling with most popular values
features['KitchenQual'] = features['KitchenQual'].fillna(features['KitchenQual'].mode()[0])

# FireplaceQu NA in all. NA means No Fireplace
features['FireplaceQu'] = features['FireplaceQu'].fillna('NoFP')

# Garage-like features NA in all. NA means No Garage
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageYrBlt', 'GarageCond'):
    features[col] = features[col].fillna('NoGRG')

# GarageCars and GarageArea NA in pred. I suppose NA means 0
for col in ('GarageCars', 'GarageArea'):
    features[col] = features[col].fillna(0.0)

# SaleType NA in pred. Filling with most popular values
features['SaleType'] = features['SaleType'].fillna(features['SaleType'].mode()[0])

# PoolQC NA in all. NA means No Pool
features['PoolQC'] = features['PoolQC'].fillna('NoPool')

# MiscFeature NA in all. NA means None
features['MiscFeature'] = features['MiscFeature'].fillna('None')

# Fence NA in all. NA means no fence
features['Fence'] = features['Fence'].fillna('NoFence')

# BsmtHalfBath and BsmtFullBath NA means 0
for col in ('BsmtHalfBath', 'BsmtFullBath'):
    features[col] = features[col].fillna(0)

# Functional NA means Typ
features['Functional'] = features['Functional'].fillna('Typ')

# NA in Bsmt SF variables means not that type of basement, 0 square feet
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF'):
    features[col] = features[col].fillna(0)

# NA in Exterior1st and Exterior2nd filled with the most common value
for col in ('Exterior1st', 'Exterior2nd'):
    features[col] = features[col].fillna(features[col].mode()[0])

# Year and Month to categorical
features['YrSold'] = features['YrSold'].astype(str)
features['MoSold'] = features['MoSold'].astype(str)

# Adding total sqfootage feature and removing Basement, 1st and 2nd floor features
features['TotalSF'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
features.drop(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], axis=1, inplace=True)

# Getting dummies from Condition1 and Condition2: each row gets a 1 in every
# condition column that appears in either feature. The removed .ix indexer is
# replaced by label-based .loc, and the column order is made deterministic.
conditions = sorted(set(features['Condition1']) | set(features['Condition2']))
dummies = pd.DataFrame(data=np.zeros((len(features.index), len(conditions))),
                       index=features.index, columns=conditions)
for idx, cond1, cond2 in zip(features.index, features['Condition1'], features['Condition2']):
    dummies.loc[idx, cond1] = 1
    dummies.loc[idx, cond2] = 1
features = pd.concat([features, dummies.add_prefix('Condition_')], axis=1)
features.drop(['Condition1', 'Condition2'], axis=1, inplace=True)

# Getting dummies from Exterior1st and Exterior2nd in the same way
exteriors = sorted(set(features['Exterior1st']) | set(features['Exterior2nd']))
dummies = pd.DataFrame(data=np.zeros((len(features.index), len(exteriors))),
                       index=features.index, columns=exteriors)
for idx, ext1, ext2 in zip(features.index, features['Exterior1st'], features['Exterior2nd']):
    dummies.loc[idx, ext1] = 1
    dummies.loc[idx, ext2] = 1
features = pd.concat([features, dummies.add_prefix('Exterior_')], axis=1)
features.drop(['Exterior1st', 'Exterior2nd'], axis=1, inplace=True)

# Getting dummies from all other categorical vars
for col in features.dtypes[features.dtypes == 'object'].index:
    for_dummy = features.pop(col)
    features = pd.concat([features, pd.get_dummies(for_dummy, prefix=col)], axis=1)

### Copying features
features_normalized = features.copy()

# Box-Cox transformation of the numeric columns (the fixed lambda 0.15 follows Serigne's kernel)
for col in features_normalized.dtypes[features_normalized.dtypes != "object"].index:
    features_normalized[col] = boxcox1p(features_normalized[col], 0.15)

### Splitting features
train_features = features.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
test_features = features.loc['test'].drop('Id', axis=1).select_dtypes(include=[np.number]).values

### Splitting normalized features
train_features_st = features_normalized.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
test_features_st = features_normalized.loc['test'].drop('Id', axis=1).select_dtypes(include=[np.number]).values

### Shuffling train sets
train_features_st, train_features, train_labels = shuffle(train_features_st, train_features, train_labels, random_state=5)

### Splitting
x_train, x_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.1, random_state=200)
x_train_st, x_test_st, y_train_st, y_test_st = train_test_split(train_features_st, train_labels, test_size=0.1, random_state=200)
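# Added helper, not in the original kernels: because the labels are log1p
# prices, cross-validated RMSE in this space approximates the leaderboard
# RMSLE. Any of the estimators defined below can be passed to it.
def cv_rmse(estimator, X, y, folds=5):
    """Cross-validated root-mean-squared error in log space."""
    mse = -cross_val_score(estimator, X, y, scoring='neg_mean_squared_error', cv=folds)
    return np.sqrt(mse)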
'''
Elastic Net
'''
ENSTest = linear_model.ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10],
                                    l1_ratio=[.01, .1, .5, .9, .99],
                                    max_iter=5000).fit(x_train_st, y_train_st)
train_test(ENSTest, x_train_st, x_test_st, y_train_st, y_test_st)

# Average R2 score and standard deviation of 5-fold cross-validation
scores = cross_val_score(ENSTest, train_features_st, train_labels, cv=5)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

'''
Gradient Boosting
'''
GBest = ensemble.GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=3,
                                           max_features='sqrt', min_samples_leaf=15,
                                           min_samples_split=10, loss='huber').fit(x_train, y_train)
train_test(GBest, x_train, x_test, y_train, y_test)

# Average R2 score and standard deviation of 5-fold cross-validation,
# evaluated on the same unnormalized features the model is trained on
scores = cross_val_score(GBest, train_features, train_labels, cv=5)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

'''
XGBoost
'''
XGBest = xgb.XGBRegressor(max_depth=3, learning_rate=0.05, n_estimators=3000).fit(x_train, y_train)
train_test(XGBest, x_train, x_test, y_train, y_test)

# Average R2 score and standard deviation of 5-fold cross-validation
scores = cross_val_score(XGBest, train_features, train_labels, cv=5)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Retraining models on the full training set
GB_model = GBest.fit(train_features, train_labels)
ENST_model = ENSTest.fit(train_features_st, train_labels)
XGB_model = XGBest.fit(train_features, train_labels)

## Getting our SalePrice estimation by averaging the three models.
## expm1 inverts the log1p applied above (plain np.exp would be off by one).
Final_labels = (np.expm1(GB_model.predict(test_features))
                + np.expm1(ENST_model.predict(test_features_st))
                + np.expm1(XGB_model.predict(test_features))) / 3

## Saving to CSV
pd.DataFrame({'Id': test.Id, 'SalePrice': Final_labels}).to_csv('submission-7.csv', index=False)
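# Hedged extension, not in the original kernels: the equal-weight average
# above is the simplest blend. Per-model weights (the values below are
# hypothetical, not tuned) can be swapped in; normally they would be chosen
# on a held-out validation set.
weights = {'gb': 0.4, 'enet': 0.3, 'xgb': 0.3}  # assumed weights, for illustration only
weighted_labels = (weights['gb'] * np.expm1(GB_model.predict(test_features))
                   + weights['enet'] * np.expm1(ENST_model.predict(test_features_st))
                   + weights['xgb'] * np.expm1(XGB_model.predict(test_features)))
# Optional alternative output:
# pd.DataFrame({'Id': test.Id, 'SalePrice': weighted_labels}).to_csv('submission-weighted.csv', index=False)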