多层感知机(MLP)实现泰坦尼克号生还预测
Kaggle竞赛-Titanic - Machine Learning from Disaster
import pandas as pd import numpy as np import torch
# Locations of the Kaggle Titanic data files.
gender_submission = "/home/dl/gender_submission.csv"
train = "/home/dl/train.csv"
test = "/home/dl/test.csv"

# Load the test split up front; the training split is read further below.
test_df = pd.read_csv(test)
df = pd.read_csv(train)
df
<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>PassengerId</th> <th>Survived</th> <th>Pclass</th> <th>Name</th> <th>Sex</th> <th>Age</th> <th>SibSp</th> <th>Parch</th> <th>Ticket</th> <th>Fare</th> <th>Cabin</th> <th>Embarked</th> </tr> </thead> <tbody> <tr> <th>0</th> <td>1</td> <td>0</td> <td>3</td> <td>Braund, Mr. Owen Harris</td> <td>male</td> <td>22.0</td> <td>1</td> <td>0</td> <td>A/5 21171</td> <td>7.2500</td> <td>NaN</td> <td>S</td> </tr> <tr> <th>1</th> <td>2</td> <td>1</td> <td>1</td> <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td> <td>female</td> <td>38.0</td> <td>1</td> <td>0</td> <td>PC 17599</td> <td>71.2833</td> <td>C85</td> <td>C</td> </tr> <tr> <th>2</th> <td>3</td> <td>1</td> <td>3</td> <td>Heikkinen, Miss. Laina</td> <td>female</td> <td>26.0</td> <td>0</td> <td>0</td> <td>STON/O2. 3101282</td> <td>7.9250</td> <td>NaN</td> <td>S</td> </tr> <tr> <th>3</th> <td>4</td> <td>1</td> <td>1</td> <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td> <td>female</td> <td>35.0</td> <td>1</td> <td>0</td> <td>113803</td> <td>53.1000</td> <td>C123</td> <td>S</td> </tr> <tr> <th>4</th> <td>5</td> <td>0</td> <td>3</td> <td>Allen, Mr. William Henry</td> <td>male</td> <td>35.0</td> <td>0</td> <td>0</td> <td>373450</td> <td>8.0500</td> <td>NaN</td> <td>S</td> </tr> <tr> <th>...</th> <td>...</td> <td>...</td> <td>...</td> <td>...</td> <td>...</td> <td>...</td> <td>...</td> <td>...</td> <td>...</td> <td>...</td> <td>...</td> <td>...</td> </tr> <tr> <th>886</th> <td>887</td> <td>0</td> <td>2</td> <td>Montvila, Rev. Juozas</td> <td>male</td> <td>27.0</td> <td>0</td> <td>0</td> <td>211536</td> <td>13.0000</td> <td>NaN</td> <td>S</td> </tr> <tr> <th>887</th> <td>888</td> <td>1</td> <td>1</td> <td>Graham, Miss. 
Margaret Edith</td> <td>female</td> <td>19.0</td> <td>0</td> <td>0</td> <td>112053</td> <td>30.0000</td> <td>B42</td> <td>S</td> </tr> <tr> <th>888</th> <td>889</td> <td>0</td> <td>3</td> <td>Johnston, Miss. Catherine Helen "Carrie"</td> <td>female</td> <td>NaN</td> <td>1</td> <td>2</td> <td>W./C. 6607</td> <td>23.4500</td> <td>NaN</td> <td>S</td> </tr> <tr> <th>889</th> <td>890</td> <td>1</td> <td>1</td> <td>Behr, Mr. Karl Howell</td> <td>male</td> <td>26.0</td> <td>0</td> <td>0</td> <td>111369</td> <td>30.0000</td> <td>C148</td> <td>C</td> </tr> <tr> <th>890</th> <td>891</td> <td>0</td> <td>3</td> <td>Dooley, Mr. Patrick</td> <td>male</td> <td>32.0</td> <td>0</td> <td>0</td> <td>370376</td> <td>7.7500</td> <td>NaN</td> <td>Q</td> </tr> </tbody> </table> <p>891 rows × 12 columns</p> </div>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
- PassengerId(乘客ID)
- Survived(是否生还)
- Pclass(客舱等级)
- Name(姓名)
- Sex(性别)
- Age(年龄)
- SibSp(兄弟姐妹/配偶数量)
- Parch(父母/子女数量)
- Ticket(船票编号)
- Fare(票价)
- Cabin(舱位号)
- Embarked(登船港口)
from sklearn.preprocessing import StandardScaler, LabelEncoder from sklearn.model_selection import train_test_split
def transform(df): data = df.copy() # 1. 处理缺失值 data['Age'] = data['Age'].fillna(data['Age'].median()) data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0]) data.drop(columns=['Cabin', 'PassengerId', 'Name', 'Ticket'], inplace = True) # 2. 类别型特征编码 data['Sex'] = LabelEncoder().fit_transform(data['Sex']) # Male=1, Female=0 data['Embarked'] = LabelEncoder().fit_transform(data['Embarked']) # S=2, C=0, Q=1 # 3. 数值特征标准化 scaler = StandardScaler() data[['Age', 'Fare']] = scaler.fit_transform(data[['Age', 'Fare']]) return data.values
X_train = transform(df.drop(columns="Survived")) X_train
array([[ 3. , 1. , -0.56573646, ..., 0. , -0.50244517, 2. ], [ 1. , 0. , 0.66386103, ..., 0. , 0.78684529, 0. ], [ 3. , 0. , -0.25833709, ..., 0. , -0.48885426, 2. ], ..., [ 3. , 0. , -0.1046374 , ..., 2. , -0.17626324, 2. ], [ 1. , 1. , -0.25833709, ..., 0. , -0.04438104, 0. ], [ 3. , 1. , 0.20276197, ..., 0. , -0.49237783, 1. ]], shape=(891, 7))
X_test = transform(test_df) X_test
array([[ 3. , 1. , 0.38623105, ..., 0. , -0.49781052, 1. ], [ 3. , 0. , 1.37137004, ..., 0. , -0.51265996, 2. ], [ 2. , 1. , 2.55353683, ..., 0. , -0.46453181, 1. ], ..., [ 3. , 1. , 0.70147553, ..., 0. , -0.50818292, 2. ], [ 3. , 1. , -0.20485235, ..., 0. , -0.4938564 , 2. ], [ 3. , 1. , -0.20485235, ..., 1. , -0.23762123, 0. ]], shape=(418, 7))
y_train = df["Survived"].values y_train
array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 
1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0])
X_train_data, X_val_data, y_train_data, y_val_data = train_test_split( X_train, y_train, test_size=0.2, random_state=42 )
class MLP(torch.nn.Module): def __init__(self, input_size, hidden_layers, output_size): super().__init__() layers = [] layer0 = torch.nn.Linear(input_size, hidden_layers[0]) layers.append(layer0) layers.append(torch.nn.ReLU()) layers.append(torch.nn.Dropout(0.3)) for l in range(len(hidden_layers) - 1): layers.append(torch.nn.Linear(hidden_layers[l], hidden_layers[l+1])) layers.append(torch.nn.ReLU()) layers.append(torch.nn.Dropout(0.3)) layers.append(torch.nn.Linear(hidden_layers[-1], output_size)) self.model = torch.nn.Sequential(*layers) def forward(self, X): return self.model(X) def predict(self, X): with torch.no_grad(): outputs = self.model(X) # 使用softmax获取概率 probabilities = torch.nn.functional.softmax(outputs, dim=1) # 获取最大概率的类别 _, predictions = torch.max(probabilities, dim=1) return predictions
X_train_data = torch.from_numpy(X_train_data).to(torch.float32) X_train_data
tensor([[ 1.0000, 1.0000, 1.2402, ..., 0.0000, -0.0746, 2.0000], [ 2.0000, 1.0000, -0.4889, ..., 0.0000, -0.3867, 2.0000], [ 3.0000, 1.0000, 0.2028, ..., 0.0000, -0.4889, 2.0000], ..., [ 3.0000, 1.0000, 0.8944, ..., 0.0000, -0.3644, 2.0000], [ 1.0000, 0.0000, -1.1805, ..., 2.0000, 1.7677, 2.0000], [ 1.0000, 1.0000, -0.6426, ..., 1.0000, 0.9077, 2.0000]])
y_train_data = torch.from_numpy(y_train_data).to(torch.long) y_train_data
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 
0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0])
X_val_data = torch.from_numpy(X_val_data).to(torch.float32) y_val_data = torch.from_numpy(y_val_data).to(torch.long)
# 7 input features → hidden layers (32, 16) → 2 output classes.
model = MLP(7,[32, 16], 2)
# CrossEntropyLoss consumes raw logits; Adam with lr=1e-3.
criterion = torch.nn.CrossEntropyLoss() optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
sum(1 for i,y_pre in enumerate(model.predict(X_val_data)) if y_pre == y_val_data[i])/len(y_val_data)*100
57.54189944134078
# 可以使用cuda加速计算 if torch.cuda.is_available(): model = model.to("cuda") X_train_data = X_train_data.to("cuda") y_train_data = y_train_data.to("cuda") X_val_data = X_val_data.to("cuda") y_val_data = y_val_data Res = torch.from_numpy(X_test).to(torch.float32).to("cuda")
from torch.utils.data import DataLoader, TensorDataset
train_data = TensorDataset(X_train_data, y_train_data) train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
for epoch in range(10): for batch_x, batch_y in train_loader: optimizer.zero_grad() outputs = model(batch_x) loss = criterion(outputs, batch_y) loss.backward() optimizer.step() model.eval()
sum(1 for i,y_pre in enumerate(model.predict(X_val_data)) if y_pre == y_val_data[i])/len(y_val_data)*100
80.44692737430168
y_res = model.predict(Res) y_res
tensor([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0], device='cuda:0')
res = y_res.cpu().numpy() res
array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0])
test_df["Survived"] = res
test_df
<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>PassengerId</th> <th>Pclass</th> <th>Name</th> <th>Sex</th> <th>Age</th> <th>SibSp</th> <th>Parch</th> <th>Ticket</th> <th>Fare</th> <th>Cabin</th> <th>Embarked</th> <th>Survived</th> </tr> </thead> <tbody> <tr> <th>0</th> <td>892</td> <td>3</td> <td>Kelly, Mr. James</td> <td>male</td> <td>34.5</td> <td>0</td> <td>0</td> <td>330911</td> <td>7.8292</td> <td>NaN</td> <td>Q</td> <td>0</td> </tr> <tr> <th>1</th> <td>893</td> <td>3</td> <td>Wilkes, Mrs. James (Ellen Needs)</td> <td>female</td> <td>47.0</td> <td>1</td> <td>0</td> <td>363272</td> <td>7.0000</td> <td>NaN</td> <td>S</td> <td>0</td> </tr> <tr> <th>2</th> <td>894</td> <td>2</td> <td>Myles, Mr. Thomas Francis</td> <td>male</td> <td>62.0</td> <td>0</td> <td>0</td> <td>240276</td> <td>9.6875</td> <td>NaN</td> <td>Q</td> <td>0</td> </tr> <tr> <th>3</th> <td>895</td> <td>3</td> <td>Wirz, Mr. Albert</td> <td>male</td> <td>27.0</td> <td>0</td> <td>0</td> <td>315154</td> <td>8.6625</td> <td>NaN</td> <td>S</td> <td>0</td> </tr> <tr> <th>4</th> <td>896</td> <td>3</td> <td>Hirvonen, Mrs. Alexander (Helga E Lindqvist)</td> <td>female</td> <td>22.0</td> <td>1</td> <td>1</td> <td>3101298</td> <td>12.2875</td> <td>NaN</td> <td>S</td> <td>1</td> </tr> <tr> <th>...</th> <td>...</td> <td>...</td> <td>...</td> <td>...</td> <td>...</td> <td>...</td> <td>...</td> <td>...</td> <td>...</td> <td>...</td> <td>...</td> <td>...</td> </tr> <tr> <th>413</th> <td>1305</td> <td>3</td> <td>Spector, Mr. Woolf</td> <td>male</td> <td>NaN</td> <td>0</td> <td>0</td> <td>A.5. 3236</td> <td>8.0500</td> <td>NaN</td> <td>S</td> <td>0</td> </tr> <tr> <th>414</th> <td>1306</td> <td>1</td> <td>Oliva y Ocana, Dona. Fermina</td> <td>female</td> <td>39.0</td> <td>0</td> <td>0</td> <td>PC 17758</td> <td>108.9000</td> <td>C105</td> <td>C</td> <td>1</td> </tr> <tr> <th>415</th> <td>1307</td> <td>3</td> <td>Saether, Mr. 
Simon Sivertsen</td> <td>male</td> <td>38.5</td> <td>0</td> <td>0</td> <td>SOTON/O.Q. 3101262</td> <td>7.2500</td> <td>NaN</td> <td>S</td> <td>0</td> </tr> <tr> <th>416</th> <td>1308</td> <td>3</td> <td>Ware, Mr. Frederick</td> <td>male</td> <td>NaN</td> <td>0</td> <td>0</td> <td>359309</td> <td>8.0500</td> <td>NaN</td> <td>S</td> <td>0</td> </tr> <tr> <th>417</th> <td>1309</td> <td>3</td> <td>Peter, Master. Michael J</td> <td>male</td> <td>NaN</td> <td>1</td> <td>1</td> <td>2668</td> <td>22.3583</td> <td>NaN</td> <td>C</td> <td>0</td> </tr> </tbody> </table> <p>418 rows × 12 columns</p> </div>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
test_df[["PassengerId", "Survived"]].to_csv("/home/dl/predict.csv",index=False)
提高泰坦尼克号数据集预测准确率的建议
目前MLP模型在泰坦尼克号验证集上达到约80%的准确率,仍有提升空间。以下是几个改进建议:
1. 数据预处理改进
# 添加更全面的数据预处理 from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.impute import SimpleImputer from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline # 数值特征和分类特征分开处理 numeric_features = ['age', 'fare'] numeric_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]) categorical_features = ['sex', 'embarked', 'pclass'] categorical_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore'))]) preprocessor = ColumnTransformer( transformers=[ ('num', numeric_transformer, numeric_features), ('cat', categorical_transformer, categorical_features)])
2. 模型架构优化
class ImprovedMLP(torch.nn.Module): def __init__(self, input_size, hidden_layers, output_size): super().__init__() layers = [] # 添加BatchNorm层 layers.append(torch.nn.BatchNorm1d(input_size)) # 第一层 layers.append(torch.nn.Linear(input_size, hidden_layers[0])) layers.append(torch.nn.BatchNorm1d(hidden_layers[0])) layers.append(torch.nn.LeakyReLU()) # 改用LeakyReLU layers.append(torch.nn.Dropout(0.5)) # 增加dropout率 # 中间层 for l in range(len(hidden_layers) - 1): layers.append(torch.nn.Linear(hidden_layers[l], hidden_layers[l+1])) layers.append(torch.nn.BatchNorm1d(hidden_layers[l+1])) layers.append(torch.nn.LeakyReLU()) layers.append(torch.nn.Dropout(0.5)) # 输出层 layers.append(torch.nn.Linear(hidden_layers[-1], output_size)) self.model = torch.nn.Sequential(*layers)
3. 训练过程优化
# 使用更合适的损失函数和优化器 model = ImprovedMLP(input_size, hidden_layers, output_size) criterion = torch.nn.CrossEntropyLoss() optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4) # 添加权重衰减 # 添加学习率调度器 scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5) # 训练循环中添加早停机制 best_val_loss = float('inf') patience = 10 counter = 0 for epoch in range(100): # 训练和验证代码... # 早停检查 if val_loss < best_val_loss: best_val_loss = val_loss counter = 0 torch.save(model.state_dict(), 'best_model.pth') else: counter += 1 if counter >= patience: print("Early stopping") break
4. 其他改进建议
- **特征工程**:
- 创建新特征如家庭大小(兄弟姐妹+父母子女)
- 对票价取对数处理
- 将年龄分箱处理
- **模型集成**:
- 尝试使用多个模型的预测结果进行投票
- 或者使用bagging/boosting方法
- **超参数调优**:
- 使用网格搜索或贝叶斯优化寻找最佳超参数
- 调整隐藏层大小和数量
- 尝试不同的dropout率
- **交叉验证**:
- 使用k折交叉验证确保模型稳定性
- **类别平衡**:
- 如果数据不平衡,可以使用加权损失函数或过采样/欠采样技术
通过综合应用这些改进方法,模型准确率应该能够提升到80-85%甚至更高。