@@@ 데이터분석/Kaggle

[Kaggle] Titanic 연습해보기

HTG 2022. 12. 30. 14:52
728x90
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
 
 
from sklearn.preprocessing import LabelEncoder

 

 
import sklearn.preprocessing
dir(sklearn.preprocessing)

 

 

DATA

# Load the Titanic training set, then separate the target series y
# ("Survived") from the feature frame X (every other column).
df = pd.read_csv("/kaggle/input/titanic/train.csv")
# df.dropna(subset=["Embarked"], inplace=True)
y = df["Survived"]
X = df.drop(columns="Survived")
print(X, y)
     PassengerId  Pclass                                               Name  \
0              1       3                            Braund, Mr. Owen Harris   
1              2       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2              3       3                             Heikkinen, Miss. Laina   
3              4       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4              5       3                           Allen, Mr. William Henry   
..           ...     ...                                                ...   
886          887       2                              Montvila, Rev. Juozas   
887          888       1                       Graham, Miss. Margaret Edith   
888          889       3           Johnston, Miss. Catherine Helen "Carrie"   
889          890       1                              Behr, Mr. Karl Howell   
890          891       3                                Dooley, Mr. Patrick   

        Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked  
0      male  22.0      1      0         A/5 21171   7.2500   NaN        S  
1    female  38.0      1      0          PC 17599  71.2833   C85        C  
2    female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S  
3    female  35.0      1      0            113803  53.1000  C123        S  
4      male  35.0      0      0            373450   8.0500   NaN        S  
..      ...   ...    ...    ...               ...      ...   ...      ...  
886    male  27.0      0      0            211536  13.0000   NaN        S  
887  female  19.0      0      0            112053  30.0000   B42        S  
888  female   NaN      1      2        W./C. 6607  23.4500   NaN        S  
889    male  26.0      0      0            111369  30.0000  C148        C  
890    male  32.0      0      0            370376   7.7500   NaN        Q  

[891 rows x 11 columns] 0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

 


EDA

| feature | meaning | 설명 | type / 처리 |
|---|---|---|---|
| pclass | Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd) | 등석 구분 | category |
| sex | Sex | 성별 | category |
| Age | Age in years | 나이 (null 177) | 구간 or int |
| sibsp | # of siblings / spouses aboard the Titanic | 형제, 배우자 수 | int |
| parch | # of parents / children aboard the Titanic | 부모, 자녀 수 | int |
| ticket | Ticket number | 티켓 넘버 | 삭제? |
| fare | Passenger fare | 승선 요금 | 등석과 상관관계일 것으로 판단 (삭제) |
| cabin | Cabin number | 좌석 넘버 | 삭제 or 관계 파악 |
| embarked | Port of Embarkation | 승선 장소 | 삭제 or 관계 파악 |

 

X.describe(include="all")
 

 

  PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
count 891.000000 891.000000 891 891 714.000000 891.000000 891.000000 891 891.000000 204 889
unique NaN NaN 891 2 NaN NaN NaN 681 NaN 147 3
top NaN NaN Braund, Mr. Owen Harris male NaN NaN NaN 347082 NaN B96 B98 S
freq NaN NaN 1 577 NaN NaN NaN 7 NaN 4 644
mean 446.000000 2.308642 NaN NaN 29.699118 0.523008 0.381594 NaN 32.204208 NaN NaN
std 257.353842 0.836071 NaN NaN 14.526497 1.102743 0.806057 NaN 49.693429 NaN NaN
min 1.000000 1.000000 NaN NaN 0.420000 0.000000 0.000000 NaN 0.000000 NaN NaN
25% 223.500000 2.000000 NaN NaN 20.125000 0.000000 0.000000 NaN 7.910400 NaN NaN
50% 446.000000 3.000000 NaN NaN 28.000000 0.000000 0.000000 NaN 14.454200 NaN NaN
75% 668.500000 3.000000 NaN NaN 38.000000 1.000000 0.000000 NaN 31.000000 NaN NaN
max 891.000000 3.000000 NaN NaN 80.000000 8.000000 6.000000 NaN 512.329200 NaN NaN
 
X.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB

 


Train

Scale

# Preprocess the training features X in place:
#   1. fill missing Age / Embarked values,
#   2. integer-encode then one-hot encode the categorical columns,
#   3. drop identifier / free-text columns,
#   4. min-max scale Fare.
# Leaves `le_encoder` and the fitted scaler `mms` defined for later cells.

# Handle missing values in Age and Embarked.
# X.dropna(subset=["Embarked"], inplace=True)
# Assign the filled column back to X instead of calling
# fillna(inplace=True) on X[...]: the in-place form acts on an intermediate
# object, raises a FutureWarning in pandas 2.2+ and leaves X unchanged
# under Copy-on-Write.
X["Age"] = X["Age"].fillna(int(X["Age"].mean()))  # mean age truncated to an integer
X["Embarked"] = X["Embarked"].fillna("S")

# Add FamilySize
# X['FamilySize'] = X['SibSp'] + X['Parch'] + 1  # +1 so the passenger themself is counted

# # Fill Age using the title ("Initial") extracted from Name
# X['Initial'] = X.Name.str.extract(r'([A-Za-z]+)\.')  # extract the salutations

# X['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],
#                         ['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr', 'Mr'],inplace=True)

# X.loc[(X.Age.isnull())&(X.Initial=='Mr'),'Age'] = 33
# X.loc[(X.Age.isnull())&(X.Initial=='Mrs'),'Age'] = 36
# X.loc[(X.Age.isnull())&(X.Initial=='Master'),'Age'] = 5
# X.loc[(X.Age.isnull())&(X.Initial=='Miss'),'Age'] = 22
# X.loc[(X.Age.isnull())&(X.Initial=='Other'),'Age'] = 46

# Integer-encode the categorical columns; the single encoder is refit for
# each column, so after the loop it only holds the mapping of the last one.
labels = ["Pclass", "Sex", "Embarked"]

le_encoder = LabelEncoder()

for col in labels:
    X[col] = le_encoder.fit_transform(X[col])
#     X[col] = X[col].astype('category')

# One-hot encode Embarked and Pclass; the integer codes become the column
# suffixes (Embarked_0..2, Pclass_0..2).
dummy_col = ['Embarked', 'Pclass']
X = pd.get_dummies(X, columns=dummy_col)

# For now, drop PassengerId, Cabin and the other non-feature columns
# in a single call.
drop_label = ["Cabin", "PassengerId", "Name", "Ticket"]
X = X.drop(columns=drop_label)

# Columns to min-max scale.
mm_label = ["Fare"]

## Apply MinMax scaling
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()

# Fit once on every listed column so `mms` keeps the training statistics of
# all scaled columns (refitting per column would only remember the last).
X[mm_label] = mms.fit_transform(X[mm_label])
 
X.describe(include="all")
   
  Sex  Age  SibSp  Parch Fare  Embarked_0  Embarked_1  Embarked_2  Pclass_0  Pclass_1  Pclass_2 
count  891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000
mean  0.647587 29.560236 0.523008 0.381594 0.062858 0.188552 0.086420 0.725028 0.242424 0.206510 0.551066
std  0.477990 13.005010 1.102743 0.806057 0.096995 0.391372 0.281141 0.446751 0.428790 0.405028 0.497665
min   0.000000 0.420000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 22.000000 0.000000 0.000000 0.015440 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 1.000000 29.000000 0.000000 0.000000 0.028213 0.000000 0.000000 1.000000 0.000000 0.000000 1.000000
75% 1.000000 35.000000 1.000000 0.000000 0.060508 0.000000 0.000000 1.000000 0.000000 0.000000 1.000000
max 1.000000 80.000000 8.000000 6.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
 
 
import matplotlib.pyplot as plt
import seaborn as sns
# Join the features with the target so the correlation matrix also shows
# how each feature relates to Survived.
heatmap_data = pd.concat([X, y], axis=1)
correlations = heatmap_data.astype(float).corr()

colormap = plt.cm.RdBu
plt.figure(figsize=(14, 12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
# Annotated, square-celled heatmap of the pairwise correlations.
sns.heatmap(
    correlations,
    cmap=colormap,
    annot=True,
    annot_kws={"size": 16},
    square=True,
    vmax=1.0,
    linewidths=0.1,
    linecolor='white',
)

# del heatmap_data
 
<AxesSubplot:title={'center':'Pearson Correlation of Features'}>

측정

from sklearn.model_selection import train_test_split

# Hold out part of the labelled data for evaluation. The seed is fixed so the
# split, and the accuracy measured on it, is reproducible between runs
# (without random_state the rows are shuffled differently every time).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=32)
 
X_train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 668 entries, 479 to 740
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Sex         668 non-null    int64  
 1   Age         668 non-null    float64
 2   SibSp       668 non-null    int64  
 3   Parch       668 non-null    int64  
 4   Fare        668 non-null    float64
 5   Embarked_0  668 non-null    uint8  
 6   Embarked_1  668 non-null    uint8  
 7   Embarked_2  668 non-null    uint8  
 8   Pclass_0    668 non-null    uint8  
 9   Pclass_1    668 non-null    uint8  
 10  Pclass_2    668 non-null    uint8  
dtypes: float64(2), int64(3), uint8(6)
memory usage: 35.2 KB
 
from xgboost import XGBClassifier

# Example model declaration: an XGBoost classifier with fixed hyperparameters.
xgb_params = {
    "n_estimators": 500,
    "learning_rate": 0.2,
    "max_depth": 3,
    "gamma": 1,
    "random_state": 32,
}
model = XGBClassifier(**xgb_params)

# Train on the training split only; X_test / y_test stay held out.
model.fit(X_train, y_train)
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=1, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.2, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=500,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=32,
              reg_alpha=0, reg_lambda=1, ...)

 

# Predict labels for the held-out split.
pred = model.predict(X_test)

from sklearn.metrics import accuracy_score

# accuracy_score is documented as (y_true, y_pred): pass the ground truth
# first. Accuracy is symmetric, so the value is unchanged, but the call now
# matches the API and stays correct if swapped for an asymmetric metric.
accuracy_score(y_test, pred)
0.8251121076233184

 

# from sklearn.ensemble import RandomForestClassifier

# model = RandomForestClassifier()

# Refit the current model on all labelled rows, then print its score on that
# same data (a training score, not a hold-out estimate).
model.fit(X, y)
train_score = model.score(X, y)
print(train_score)
0.8630751964085297

 


Test

# Load the Kaggle test set and apply the same preprocessing steps as the
# training features: fill missing values, encode categoricals, drop
# non-feature columns, scale Fare.
Test = pd.read_csv("/kaggle/input/titanic/test.csv")

# Test.dropna(subset=["Embarked","Fare"], inplace=True)
# Handle missing values in Age and Fare.
# X.dropna(subset=["Embarked"], inplace=True)
# Assign the filled column back instead of calling fillna(inplace=True) on
# Test[...]: the in-place form acts on an intermediate object, raises a
# FutureWarning in pandas 2.2+ and leaves Test unchanged under Copy-on-Write.
Test["Age"] = Test["Age"].fillna(int(Test["Age"].mean()))
Test["Fare"] = Test["Fare"].fillna(int(Test["Fare"].mean()))

# Fill Age using the title ("Initial") extracted from Name
# Test['Initial'] = Test.Name.str.extract(r'([A-Za-z]+)\.')  # extract the salutations


# Test['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],
#                         ['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr', 'Mr'],inplace=True)

# Test.loc[(Test.Age.isnull())&(Test.Initial=='Mr'),'Age'] = 33
# Test.loc[(Test.Age.isnull())&(Test.Initial=='Mrs'),'Age'] = 36
# Test.loc[(Test.Age.isnull())&(Test.Initial=='Master'),'Age'] = 5
# Test.loc[(Test.Age.isnull())&(Test.Initial=='Miss'),'Age'] = 22
# Test.loc[(Test.Age.isnull())&(Test.Initial=='Other'),'Age'] = 46

# Test['FamilySize'] = Test['SibSp'] + Test['Parch'] + 1  # +1 so the passenger themself is counted

# Integer-encode the categorical columns.
# NOTE(review): the encoder is refit on the test data, so the codes only
# match the training codes if both sets contain the same categories in each
# column — confirm, or persist one fitted encoder per column from training.
labels = ["Pclass", "Sex", "Embarked"]

le_encoder = LabelEncoder()

for col in labels:
    Test[col] = le_encoder.fit_transform(Test[col])
#     Test[col] = Test[col].astype('category')

# One-hot encode Embarked and Pclass.
dummy_col = ['Embarked', 'Pclass']
Test = pd.get_dummies(Test, columns=dummy_col)

# For now, drop PassengerId, Cabin and the other non-feature columns
# in a single call.
drop_label = ["Cabin", "PassengerId", "Name", "Ticket"]
Test = Test.drop(columns=drop_label)

# Columns to min-max scale.
mm_label = ["Fare"]

## Apply MinMax scaling
# Reuse `mms`, the scaler already fitted on the training Fare column, and
# only transform here. Fitting a fresh scaler on the test set would map Fare
# with different min/max than the data the model was trained on.
Test[mm_label] = mms.transform(Test[mm_label])
 
Test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Sex         418 non-null    int64  
 1   Age         418 non-null    float64
 2   SibSp       418 non-null    int64  
 3   Parch       418 non-null    int64  
 4   Fare        418 non-null    float64
 5   Embarked_0  418 non-null    uint8  
 6   Embarked_1  418 non-null    uint8  
 7   Embarked_2  418 non-null    uint8  
 8   Pclass_0    418 non-null    uint8  
 9   Pclass_1    418 non-null    uint8  
 10  Pclass_2    418 non-null    uint8  
dtypes: float64(2), int64(3), uint8(6)
memory usage: 18.9 KB

 

test_pred = pd.DataFrame(model.predict(Test))
 

Feature Importance

from pandas import Series

# Pair each importance value from the fitted model with its column name and
# draw them, sorted ascending, as a horizontal bar chart.
feature_importance = model.feature_importances_
Series_feat_imp = Series(data=feature_importance, index=Test.columns)

plt.figure(figsize=(8, 8))
importance_axes = Series_feat_imp.sort_values(ascending=True).plot.barh()
importance_axes.set_xlabel('Feature importance')
importance_axes.set_ylabel('Feature')
plt.show()

 


Submission

sub = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")
 
sub
Out[36]:
     PassengerId  Survived
0            892         0
1            893         1
2            894         0
3            895         0
4            896         1
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

418 rows × 2 columns

 
sub['Survived'] = test_pred 

 

 
sub['Survived'].value_counts()

 

 
0    283
1    135
Name: Survived, dtype: int64
 
sub["Survived"].astype(int)
0      0
1      0
2      0
3      0
4      0
      ..
413    0
414    1
415    0
416    0
417    0
Name: Survived, Length: 418, dtype: int64

 

sub
Out[40]:
     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         0
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

418 rows × 2 columns

 
from pytz import timezone
from datetime import datetime
# Current time in the Asia/Seoul timezone: printed in full for the log, and
# kept in a shorter month.day_time form in `date` for the submission filename.
today = datetime.now(tz=timezone('Asia/Seoul'))
print(f"{today:%Y.%m.%d - %H:%M:%S}")

date = f"{today:%m.%d_%H:%M:%S}"
2022.12.30 - 14:01:54

 

 
# Write the submission frame to a timestamped CSV, without the index column.
filename = f"submission_{date}.csv"
sub.to_csv(filename, index=False)