- # -*- coding: utf-8 -*-
- # This code is initially based on the Kaggle kernel by Sergei Neviadomski, which can be found at the following link:
- # https://www.kaggle.com/neviadomski/how-to-get-to-top-25-with-simple-model-sklearn/notebook
- # and on the Kaggle kernel by Pedro Marcelino, which can be found at the link below:
- # https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python/notebook
- # Adding needed libraries and reading data
- import pandas as pd
- import numpy as np
- import matplotlib.pyplot as plt
- import seaborn as sns
- from sklearn import ensemble, tree, linear_model
- from sklearn.model_selection import train_test_split, cross_val_score
- from sklearn.metrics import r2_score, mean_squared_error
- from sklearn.utils import shuffle
- from scipy import stats
- from scipy.stats import norm, skew
- import xgboost as xgb
- import warnings
- warnings.filterwarnings('ignore')
- train = pd.read_csv("../../train.csv")
- test = pd.read_csv("../../test.csv")
- # Prints R2, RMSE and RMSLE scores
- def get_score(prediction, labels):
-     # sklearn metrics expect (y_true, y_pred)
-     print('R2: {}'.format(r2_score(labels, prediction)))
-     print('RMSE: {}'.format(np.sqrt(mean_squared_error(labels, prediction))))
-     print('RMSLE: {}'.format(np.sqrt(np.square(np.log(prediction + 1) - np.log(labels + 1)).mean())))
- # Shows scores for train and validation sets
- def train_test(estimator, x_trn, x_tst, y_trn, y_tst):
-     # Printing estimator
-     print(estimator)
-     # Printing train scores
-     prediction_train = estimator.predict(x_trn)
-     print("Train")
-     get_score(prediction_train, y_trn)
-     # Printing validation scores
-     prediction_test = estimator.predict(x_tst)
-     print("Test")
-     get_score(prediction_test, y_tst)
- # Deleting outliers
- train = train.drop(train[(train['GrLivArea']>4000) & (train['SalePrice']<300000)].index)
- # We use the numpy function log1p, which applies log(1+x) to all elements of the column
- train["SalePrice"] = np.log1p(train["SalePrice"])
- #Check the new distribution
- sns.distplot(train['SalePrice'], fit=norm)
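- # If you are on a newer seaborn where distplot has been removed, a rough equivalent sketch
- # (left commented out, parameters assumed) would be:
- # sns.histplot(train['SalePrice'], kde=True, stat='density')
- # xs = np.linspace(train['SalePrice'].min(), train['SalePrice'].max(), 200)
- # plt.plot(xs, norm.pdf(xs, *norm.fit(train['SalePrice'])))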
- # Get the fitted parameters used by the function
- (mu, sigma) = norm.fit(train['SalePrice'])
- print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
- #Now plot the distribution
- plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f})'.format(mu, sigma)],
-            loc='best')
- plt.ylabel('Frequency')
- plt.title('SalePrice distribution')
- # Also get the QQ plot
- fig = plt.figure()
- res = stats.probplot(train['SalePrice'], plot=plt)
- plt.show()
- # Splitting into features and labels, and deleting variables I don't need
- train_labels = train.pop('SalePrice')
- # The test set has no 'SalePrice' column, so the train and test sets can be concatenated
- features = pd.concat([train, test], keys=['train', 'test'])
- # I decided to get rid of features that have more than half of their values missing or that do not correlate with SalePrice
- features.drop(['Utilities', 'RoofMatl', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'Heating', 'LowQualFinSF',
- 'BsmtFullBath', 'BsmtHalfBath', 'Functional', 'GarageYrBlt', 'GarageArea', 'GarageCond', 'WoodDeckSF',
- 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal'],
- axis=1, inplace=True)
- # Checking for missing data, showing every variable with at least one missing value in the combined train+test set
- total_missing_data = features.isnull().sum().sort_values(ascending=False)
- missing_data_percent = (features.isnull().sum()/features.isnull().count()).sort_values(ascending=False)
- missing_data = pd.concat([total_missing_data, missing_data_percent], axis=1, keys=['Total', 'Percent'])
- print(missing_data[missing_data['Percent']> 0])
- # Converting OverallCond and OverallQual to str
- features.OverallCond = features.OverallCond.astype(str)
- features.OverallQual = features.OverallQual.astype(str)
- # MSSubClass as str
- features['MSSubClass'] = features['MSSubClass'].astype(str)
- # MSZoning NA in pred. filling with most popular values
- features['MSZoning'] = features['MSZoning'].fillna(features['MSZoning'].mode()[0])
- # LotFrontage NA filling with median according to its OverallQual value
- median = features.groupby('OverallQual')['LotFrontage'].transform('median')
- features['LotFrontage'] = features['LotFrontage'].fillna(median)
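- # Note: groupby().transform('median') returns a Series aligned to the original row index
- # (one group median per row), which is why it can be passed directly to fillna.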
- # Alley NA in all. NA means no access
- features['Alley'] = features['Alley'].fillna('NOACCESS')
- # MasVnrType NA in all. filling with most popular values
- features['MasVnrType'] = features['MasVnrType'].fillna(features['MasVnrType'].mode()[0])
- # BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2
- # NA in all. NA means No basement
- for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
-     features[col] = features[col].fillna('NoBSMT')
- # TotalBsmtSF NA in pred. I suppose NA means 0
- features['TotalBsmtSF'] = features['TotalBsmtSF'].fillna(0)
- # Electrical NA in pred. filling with most popular values
- features['Electrical'] = features['Electrical'].fillna(features['Electrical'].mode()[0])
- # KitchenAbvGr to categorical
- features['KitchenAbvGr'] = features['KitchenAbvGr'].astype(str)
- # KitchenQual NA in pred. filling with most popular values
- features['KitchenQual'] = features['KitchenQual'].fillna(features['KitchenQual'].mode()[0])
- # FireplaceQu NA in all. NA means No Fireplace
- features['FireplaceQu'] = features['FireplaceQu'].fillna('NoFP')
- # GarageType, GarageFinish, GarageQual NA in all. NA means No Garage
- for col in ('GarageType', 'GarageFinish', 'GarageQual'):
-     features[col] = features[col].fillna('NoGRG')
- # GarageCars NA in pred. I suppose NA means 0
- features['GarageCars'] = features['GarageCars'].fillna(0.0)
- # SaleType NA in pred. filling with most popular values
- features['SaleType'] = features['SaleType'].fillna(features['SaleType'].mode()[0])
- # Year and Month to categorical
- features['YrSold'] = features['YrSold'].astype(str)
- features['MoSold'] = features['MoSold'].astype(str)
- # Adding a total square footage feature (TotalSF) and removing the basement, 1st and 2nd floor features
- features['TotalSF'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
- features.drop(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], axis=1, inplace=True)
- ## Standardizing numeric features
- numeric_features = features.loc[:,['LotFrontage', 'LotArea', 'GrLivArea', 'TotalSF']]
- numeric_features_standardized = (numeric_features - numeric_features.mean())/numeric_features.std()
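- # A roughly equivalent alternative using sklearn's StandardScaler (a sketch, left commented
- # so the pipeline above is unchanged; StandardScaler uses ddof=0 while pandas .std() uses
- # ddof=1, so the values differ very slightly):
- # from sklearn.preprocessing import StandardScaler
- # numeric_features_standardized = pd.DataFrame(
- #     StandardScaler().fit_transform(numeric_features),
- #     index=numeric_features.index, columns=numeric_features.columns)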
- #ax = sns.pairplot(numeric_features_standardized)
- # Getting Dummies from Condition1 and Condition2
- conditions = set([x for x in features['Condition1']] + [x for x in features['Condition2']])
- dummies = pd.DataFrame(data=np.zeros((len(features.index), len(conditions))),
- index=features.index, columns=conditions)
- for i, cond in enumerate(zip(features['Condition1'], features['Condition2'])):
-     # .ix was removed in pandas 1.0; use iloc with explicit column positions instead
-     dummies.iloc[i, [dummies.columns.get_loc(c) for c in set(cond)]] = 1
- features = pd.concat([features, dummies.add_prefix('Condition_')], axis=1)
- features.drop(['Condition1', 'Condition2'], axis=1, inplace=True)
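- # An equivalent, more compact construction of the Condition_ dummies (a sketch only, left
- # commented out since Condition1/Condition2 have just been dropped):
- # cond_dummies = (pd.get_dummies(features['Condition1'])
- #                 .add(pd.get_dummies(features['Condition2']), fill_value=0)
- #                 .clip(upper=1).add_prefix('Condition_'))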
- # Getting Dummies from Exterior1st and Exterior2nd
- exteriors = set([x for x in features['Exterior1st']] + [x for x in features['Exterior2nd']])
- dummies = pd.DataFrame(data=np.zeros((len(features.index), len(exteriors))),
- index=features.index, columns=exteriors)
- for i, ext in enumerate(zip(features['Exterior1st'], features['Exterior2nd'])):
-     # .ix was removed in pandas 1.0; use iloc with explicit column positions instead
-     dummies.iloc[i, [dummies.columns.get_loc(e) for e in set(ext)]] = 1
- features = pd.concat([features, dummies.add_prefix('Exterior_')], axis=1)
- features.drop(['Exterior1st', 'Exterior2nd', 'Exterior_nan'], axis=1, inplace=True)
- # Getting Dummies from all other categorical vars
- for col in features.dtypes[features.dtypes == 'object'].index:
-     for_dummy = features.pop(col)
-     # dtype=float keeps the dummies numeric so select_dtypes(include=[np.number]) below retains them
-     features = pd.concat([features, pd.get_dummies(for_dummy, prefix=col, dtype=float)], axis=1)
- ### Copying features
- features_standardized = features.copy()
- ### Replacing numeric features by standardized values
- features_standardized.update(numeric_features_standardized)
- ### Splitting features
- train_features = features.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
- test_features = features.loc['test'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
- ### Splitting standardized features
- train_features_st = features_standardized.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
- test_features_st = features_standardized.loc['test'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
- ### Shuffling train sets
- train_features_st, train_features, train_labels = shuffle(train_features_st, train_features, train_labels, random_state = 5)
- ### Splitting
- x_train, x_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.1, random_state=200)
- x_train_st, x_test_st, y_train_st, y_test_st = train_test_split(train_features_st, train_labels, test_size=0.1, random_state=200)
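- # Because both calls use the same test_size and random_state, the raw and standardized splits
- # contain exactly the same rows, so y_train/y_test equal y_train_st/y_test_st.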
- '''
- Elastic Net
- '''
- ENSTest = linear_model.ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10], l1_ratio=[.01, .1, .5, .9, .99], max_iter=5000).fit(x_train_st, y_train_st)
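- # Optional check (not in the original kernels): ElasticNetCV stores the cross-validated
- # choice of regularization strength and L1/L2 mix in alpha_ and l1_ratio_.
- print('Selected alpha: {}, l1_ratio: {}'.format(ENSTest.alpha_, ENSTest.l1_ratio_))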
- train_test(ENSTest, x_train_st, x_test_st, y_train_st, y_test_st)
- # Average R2 score and standard deviation of 5-fold cross-validation
- scores = cross_val_score(ENSTest, train_features_st, train_labels, cv=5)
- print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
- '''
- Gradient Boosting
- '''
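- # Note: loss='huber' blends squared and absolute error, which makes the boosting fit less
- # sensitive to any remaining price outliers than the default squared-error loss.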
- GBest = ensemble.GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=3, max_features='sqrt',
- min_samples_leaf=15, min_samples_split=10, loss='huber').fit(x_train, y_train)
- train_test(GBest, x_train, x_test, y_train, y_test)
- # Average R2 score and standard deviation of 5-fold cross-validation
- scores = cross_val_score(GBest, train_features_st, train_labels, cv=5)
- print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
- '''
- XGBoost
- '''
- XGBest = xgb.XGBRegressor(max_depth=3, learning_rate=0.05, n_estimators=3000).fit(x_train, y_train)
- train_test(XGBest, x_train, x_test, y_train, y_test)
- # Average R2 score and standard deviation of 5-fold cross-validation
- scores = cross_val_score(XGBest, train_features_st, train_labels, cv=5)
- print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
- # Retraining models
- GB_model = GBest.fit(train_features, train_labels)
- ENST_model = ENSTest.fit(train_features_st, train_labels)
- XGB_model = XGBest.fit(train_features, train_labels)
- ## Getting our SalePrice estimation
- # SalePrice was transformed with log1p, so predictions are inverted with expm1 before averaging
- Final_labels = (np.expm1(GB_model.predict(test_features)) + np.expm1(ENST_model.predict(test_features_st))
-                 + np.expm1(XGB_model.predict(test_features))) / 3
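- # Optional quick check (not in the original kernels): summary statistics of the blended
- # price predictions before writing the submission file.
- print(pd.Series(Final_labels).describe())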
- ## Saving to CSV
- pd.DataFrame({'Id': test.Id, 'SalePrice': Final_labels}).to_csv('submission-5.csv', index=False)