This is a copy of a post I published on kaggle.com 4 years ago; it was the first time I used the Keras framework on the kaggle.com platform for machine learning research. kaggle.com hosts some interesting ML competitions with $1,000,000 prizes.
Original post: https://www.kaggle.com/simonxflu/deep-learning-from-titanic-disaster
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
import keras
from keras import backend as K
from keras.models import Sequential
from keras.models import Model
from keras.layers.core import Dense, Dropout, Activation, Reshape
from keras.layers import Input
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# fix random seed for reproducibility
RANDOM_STATE = 7
np.random.seed(RANDOM_STATE)
train = pd.read_csv('../input/train.csv', dtype={'Age': np.float32, 'Fare': np.float32})
# get rid of the useless cols
train.drop(['PassengerId', 'Ticket'], axis=1, inplace=True)
train.info()
train.head()
# age feature research
# deal with the missing ages of passengers whose title is Master
masterAgeMean = train.loc[(train['Age'].isnull() == False) & (train['Name'].str.contains(', Master. ') == True), "Age"].mean()
train.loc[(train['Age'].isnull() == True) & (train['Name'].str.contains(', Master. ') == True), "Age"] = masterAgeMean
train.loc[train['Name'].str.contains(', Master. ') == True]
nan_num = train['Age'].isnull().sum()

# there are 173 other missing ages; fill them with random ints drawn from mean +/- std
age_mean = train['Age'].mean()
age_std = train['Age'].std()
filling = np.random.randint(age_mean - age_std, age_mean + age_std, size=nan_num)
train.loc[train['Age'].isnull(), 'Age'] = filling
nan_num = train['Age'].isnull().sum()

# look into the age col
s = sns.FacetGrid(train, hue='Survived', aspect=3)
s.map(sns.kdeplot, 'Age', shade=True)
s.set(xlim=(0, train['Age'].max()))
s.add_legend()
train.drop(['Name'], axis=1, inplace=True)
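The ', Master. ' trick generalizes to the other titles. A hedged sketch (my addition, not part of the original pipeline; it would have to run before the Name column is dropped above): extract the honorific with a regex and fill each title group's missing ages with that group's median.

# 'Braund, Mr. Owen Harris' -> 'Mr'; Title is a temporary helper column
train['Title'] = train['Name'].str.extract(r',\s*([^.]+)\.', expand=False)
train['Age'] = train.groupby('Title')['Age'].transform(lambda s: s.fillna(s.median()))
train.drop('Title', axis=1, inplace=True)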
# Combine the SibSp and Parch features into a Family feature
# check
print(train['SibSp'].value_counts(dropna=False))
print(train['Parch'].value_counts(dropna=False))
sns.factorplot('SibSp', 'Survived', data=train, size=5)
sns.factorplot('Parch', 'Survived', data=train, size=5)
'''
The plots suggest that the survival rate drops as the number of family members
grows. Create a new col that adds up Parch and SibSp to check this theory.
'''
train['Family'] = train['SibSp'] + train['Parch']
sns.factorplot('Family', 'Survived', data=train, size=5)
train.drop(['SibSp', 'Parch'], axis=1, inplace=True)
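A quick numeric check of the same idea (a sketch, not in the original notebook): group by the new Family column and compare mean survival per family size.

# mean survival rate per family size; larger families should show lower rates
print(train.groupby('Family')['Survived'].mean())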
# fare research
train.Fare.isnull().sum()
sns.factorplot('Survived', 'Fare', data=train, size=5)
# according to the plot, passengers who paid smaller fares had a lower survival rate
# Cabin feature research
# checking missing values: 687 out of 891 are missing, so drop this col
train.Cabin.value_counts(dropna=False)
train.drop('Cabin', axis=1, inplace=True)
# Embarked feature research
# 2 missing values
train.Embarked.value_counts(dropna=False)
# fill the missing values with the majority value, 'S'
train['Embarked'].fillna('S', inplace=True)
sns.factorplot('Embarked', 'Survived', data=train, size=6)
# 'C' has the highest survival rate
# Define a few feature preparation helpers
def normalize(series):
    # fit a RobustScaler on the series, scale it in place, and return the scaler
    lo = series.min()
    hi = series.max()
    if lo == hi:
        print(series.name + ' has only one value and should be removed.')
    scaler = RobustScaler()
    x = scaler.fit_transform(series.values.reshape(-1, 1)).reshape(-1)
    series.update(pd.Series(x))
    return scaler

def transform(series, scaler):
    # scale a series in place with an already-fitted scaler
    x = scaler.transform(series.values.reshape(-1, 1)).reshape(-1)
    series.update(pd.Series(x))

def encode_one_hot(df, column, axis=1):
    # replace a categorical column with its one-hot dummy columns
    x = df.join(pd.get_dummies(df[column], prefix=column, sparse=True))
    x.drop(column, axis=axis, inplace=True)
    return x

def FeaturesImportances(X_train, y_train, featureNames):
    # rank features with a random forest and plot their importances
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import SelectFromModel
    rfr = RandomForestClassifier()
    rfr.fit(X_train, y_train)
    sfm = SelectFromModel(rfr, prefit=True, threshold=0)
    selected = sfm.get_support()
    names = featureNames[selected]
    scores = rfr.feature_importances_[selected]
    importances = pd.DataFrame({'feature': names, 'importance': np.round(scores, 5)})
    importances = importances.sort_values('importance', ascending=False).set_index('feature')
    importances.plot.bar()
    return sfm
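Why RobustScaler rather than min-max or z-score scaling? It centers on the median and scales by the interquartile range, so extreme values (like a handful of very large fares) barely move the scale. A minimal standalone check:

from sklearn.preprocessing import RobustScaler
import numpy as np
v = np.array([1., 2., 3., 4., 100.]).reshape(-1, 1)  # 100 is an outlier
print(RobustScaler().fit_transform(v).ravel())
# the first four values stay in a narrow band around zero;
# mean/std scaling would let the outlier squash them together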
# Encoding training features
print("Encoding Pclass categorical features...")
train = encode_one_hot(train, 'Pclass')
print("Encoding Sex categorical features...")
train = encode_one_hot(train, 'Sex')
print("Encoding Embarked categorical features...")
train = encode_one_hot(train, 'Embarked')

# scale only the numeric features, keeping each fitted scaler for the test set
print("Scaling numeric features...")
ageScaler = normalize(train["Age"])
fareScaler = normalize(train["Fare"])
familyScaler = normalize(train["Family"])

y = train['Survived']
train.drop('Survived', axis=1, inplace=True)
X = train
X.head()
# Check feature importances. I keep all the features in X because there are
# only a few of them.
FeaturesImportances(X, y, X.columns.values)
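The helper returns the fitted SelectFromModel; with a wider feature set it could prune X rather than just plot. A sketch (my addition, not used in this notebook):

sfm = FeaturesImportances(X, y, X.columns.values)
X_reduced = sfm.transform(X)  # keeps only columns whose importance clears the threshold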
'''
# build a deep neural network model with batch normalization layers
def build_model(X_train):
    model = Sequential()
    model.add(Dense(256, use_bias=False, input_dim=X_train.shape[1]))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(128, use_bias=False))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    #model.add(Dense(64, use_bias=False))
    #model.add(BatchNormalization())
    #model.add(Activation('relu'))
    #model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    adam = Adam(lr=0.05)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
    return model

model = build_model(X)
'''
# DenseNet-style model: every block receives the concatenation of the raw
# input and the outputs of all earlier blocks
def DenseNet(X_train):
    ip = Input(shape=(X_train.shape[1],))
    x_list = [ip]
    x = ip
    for width in [128, 128, 64, 64, 32, 32, 16, 16]:
        if len(x_list) > 1:
            x = keras.layers.concatenate(x_list)
        x = Dense(width, use_bias=False)(x)
        x = BatchNormalization()(x)
        x = Activation('relu')(x)
        x = Dropout(0.5)(x)
        x_list.append(x)
    op = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=ip, outputs=op)
    adam = Adam(lr=0.05)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
    return model

model = DenseNet(X)
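A quick sanity check of the wiring (my note): printing the graph should show each successive concatenate layer taking one more input tensor than the previous one.

model.summary()  # the 'Connected to' column lists each concatenate layer's growing input set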
# Training plots
from IPython import display
plt.rcParams['figure.figsize'] = (10, 10)

class PlotTraining(keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.i = 0
        self.x = []
        self.losses = []
        self.val_losses = []
        self.accs = []
        self.val_accs = []
        f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
        self.fig = f
        self.ax1 = ax1
        self.ax2 = ax2

    def on_epoch_end(self, epoch, logs={}):
        if self.i % 100 == 0:
            self.x.append(self.i)
            self.losses.append(logs.get('loss'))
            self.val_losses.append(logs.get('val_loss'))
            self.accs.append(logs.get('acc'))
            self.val_accs.append(logs.get('val_acc'))
            display.clear_output(wait=True)
            self.ax1.clear()
            self.ax1.plot(self.x, self.losses, label="Train")
            self.ax1.plot(self.x, self.val_losses, label="Validation")
            self.ax1.set_ylabel('Loss')
            self.ax1.legend()
            self.ax2.clear()
            self.ax2.plot(self.x, self.accs, label="Train")
            self.ax2.plot(self.x, self.val_accs, label="Validation")
            self.ax2.set_ylabel('Accuracy')
            self.ax2.set_xlabel('Epoch')
            self.ax2.legend()
            display.display(plt.gcf())
        self.i += 1

trainCallback = PlotTraining()
# Train the model for 40000 epochs
EPOCHS = 40000
BATCH_SIZE = 64
print("Training... it may take an hour or two.")
model.fit(X.values, y.values,
          epochs=EPOCHS,
          batch_size=BATCH_SIZE,
          verbose=0,
          validation_split=0.3,
          callbacks=[trainCallback])

# evaluate the model on the full training set
scores = model.evaluate(X.values, y.values, verbose=0)
print("%s: %.3f%%" % (model.metrics_names[1], scores[1] * 100))
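One caveat (my note, not in the original run): evaluating on the same rows the model was fitted on gives an optimistic score. Keras's validation_split holds out the last fraction of the rows before any shuffling, so scoring just that slice is a fairer sketch:

# score only the rows Keras held out as validation data
split_at = int(len(X) * (1 - 0.3))  # Keras takes the last samples as validation
val_scores = model.evaluate(X.values[split_at:], y.values[split_at:], verbose=0)
print("validation %s: %.3f%%" % (model.metrics_names[1], val_scores[1] * 100))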
# Predicting with the test data
test = pd.read_csv('../input/test.csv', dtype={'Age': np.float32, 'Fare': np.float32})

# deal with the missing ages
# missing ages of passengers whose title is Master
test.loc[(test['Age'].isnull() == True) & (test['Name'].str.contains(', Master. ') == True), "Age"] = masterAgeMean
test.loc[test['Name'].str.contains(', Master. ') == True]
nan_num = test['Age'].isnull().sum()
# 86 null
age_mean = test['Age'].mean()
age_std = test['Age'].std()
filling = np.random.randint(age_mean - age_std, age_mean + age_std, size=nan_num)
test.loc[test['Age'].isnull() == True, 'Age'] = filling
nan_num = test['Age'].isnull().sum()

# deal with the missing fare
test['Fare'].fillna(test['Fare'].median(), inplace=True)

# create the Family feature
test['Family'] = test['SibSp'] + test['Parch']
test.drop(['SibSp', 'Parch'], axis=1, inplace=True)
test.drop('Cabin', axis=1, inplace=True)

# deal with the missing Embarked values
test['Embarked'].fillna('S', inplace=True)

# encode the test features and scale them with the scalers fitted on the training set
test = encode_one_hot(test, 'Pclass')
test = encode_one_hot(test, 'Sex')
test = encode_one_hot(test, 'Embarked')
transform(test["Age"], ageScaler)
transform(test["Fare"], fareScaler)
transform(test["Family"], familyScaler)

X_test = test.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=False)
# Make sure X_test and X have the same columns in the same order so that
# X_test fits the input of the model
X.head()
X_test.head()
y_pred = model.predict(X_test.values)
y_pred = (y_pred > 0.5).astype('int32')
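Eyeballing the two head() outputs works here, but a programmatic guard is safer. A sketch (my addition): reindex the test frame against the training columns, so a category that never occurs in the test set becomes an all-zero column instead of a shape mismatch.

# enforce identical column order; run this before model.predict
X_test = X_test.reindex(columns=X.columns, fill_value=0)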
# submit
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": y_pred.reshape(-1)
})
print(submission['Survived'].value_counts(dropna=False))
submission.to_csv("prediction.csv", index=False)
print("Submitted.")