728x90
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# List every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
from sklearn.preprocessing import LabelEncoder
import sklearn.preprocessing
dir(sklearn.preprocessing)
DATA
# Load the Titanic training set and split it into features (X) and target (y).
df = pd.read_csv("/kaggle/input/titanic/train.csv")
# Alternative considered: drop the rows whose "Embarked" value is missing.
# df.dropna(subset=["Embarked"], inplace=True)
y = df["Survived"]
X = df.drop(columns=["Survived"])
print(X, y)
PassengerId Pclass Name \
0 1 3 Braund, Mr. Owen Harris
1 2 1 Cumings, Mrs. John Bradley (Florence Briggs Th...
2 3 3 Heikkinen, Miss. Laina
3 4 1 Futrelle, Mrs. Jacques Heath (Lily May Peel)
4 5 3 Allen, Mr. William Henry
.. ... ... ...
886 887 2 Montvila, Rev. Juozas
887 888 1 Graham, Miss. Margaret Edith
888 889 3 Johnston, Miss. Catherine Helen "Carrie"
889 890 1 Behr, Mr. Karl Howell
890 891 3 Dooley, Mr. Patrick
Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 male 22.0 1 0 A/5 21171 7.2500 NaN S
1 female 38.0 1 0 PC 17599 71.2833 C85 C
2 female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 female 35.0 1 0 113803 53.1000 C123 S
4 male 35.0 0 0 373450 8.0500 NaN S
.. ... ... ... ... ... ... ... ...
886 male 27.0 0 0 211536 13.0000 NaN S
887 female 19.0 0 0 112053 30.0000 B42 S
888 female NaN 1 2 W./C. 6607 23.4500 NaN S
889 male 26.0 0 0 111369 30.0000 C148 C
890 male 32.0 0 0 370376 7.7500 NaN Q
[891 rows x 11 columns] 0 0
1 1
2 1
3 1
4 0
..
886 0
887 1
888 0
889 1
890 0
Name: Survived, Length: 891, dtype: int64
EDA
feature | meaning | description | type |
pclass | Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd) | 등석 구분 | category |
sex | Sex | 성별 | category |
Age | Age in years | 나이(null 177) | 구간 or int |
sibsp | # of siblings / spouses aboard the Titanic | 형제, 배우자 수 | int |
parch | # of parents / children aboard the Titanic | 부모, 자녀 수 | int |
ticket | Ticket number | 티켓 넘버 | 삭제? |
fare | Passenger fare | 승선 요금 | 등석과 상관관계일 것으로 판단 (삭제) |
cabin | Cabin number | 좌석 넘버 | 삭제 or 관계 파악 |
embarked | Port of Embarkation | 승선 장소 | 삭제 or 관계 파악 |
# Summary statistics for every feature column, numeric and non-numeric alike.
X.describe(include="all")
PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
count | 891.000000 | 891.000000 | 891 | 891 | 714.000000 | 891.000000 | 891.000000 | 891 | 891.000000 | 204 | 889 |
unique | NaN | NaN | 891 | 2 | NaN | NaN | NaN | 681 | NaN | 147 | 3 |
top | NaN | NaN | Braund, Mr. Owen Harris | male | NaN | NaN | NaN | 347082 | NaN | B96 B98 | S |
freq | NaN | NaN | 1 | 577 | NaN | NaN | NaN | 7 | NaN | 4 | 644 |
mean | 446.000000 | 2.308642 | NaN | NaN | 29.699118 | 0.523008 | 0.381594 | NaN | 32.204208 | NaN | NaN |
std | 257.353842 | 0.836071 | NaN | NaN | 14.526497 | 1.102743 | 0.806057 | NaN | 49.693429 | NaN | NaN |
min | 1.000000 | 1.000000 | NaN | NaN | 0.420000 | 0.000000 | 0.000000 | NaN | 0.000000 | NaN | NaN |
25% | 223.500000 | 2.000000 | NaN | NaN | 20.125000 | 0.000000 | 0.000000 | NaN | 7.910400 | NaN | NaN |
50% | 446.000000 | 3.000000 | NaN | NaN | 28.000000 | 0.000000 | 0.000000 | NaN | 14.454200 | NaN | NaN |
75% | 668.500000 | 3.000000 | NaN | NaN | 38.000000 | 1.000000 | 0.000000 | NaN | 31.000000 | NaN | NaN |
max | 891.000000 | 3.000000 | NaN | NaN | 80.000000 | 8.000000 | 6.000000 | NaN | 512.329200 | NaN | NaN |
# Print each column's dtype and non-null count.
X.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Pclass 891 non-null int64
2 Name 891 non-null object
3 Sex 891 non-null object
4 Age 714 non-null float64
5 SibSp 891 non-null int64
6 Parch 891 non-null int64
7 Ticket 891 non-null object
8 Fare 891 non-null float64
9 Cabin 204 non-null object
10 Embarked 889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB
Train
Scale
# Preprocess the training features: impute, encode, one-hot, drop, scale.

# Handle the missing Age and Embarked values.
# Alternative considered: drop the rows whose Embarked value is missing.
# X.dropna(subset=["Embarked"], inplace=True)
# Assign the filled column back instead of calling fillna(inplace=True) on
# X[...]: the chained in-place form is deprecated in pandas 2.x and no longer
# updates X once copy-on-write is enabled.
X["Age"] = X["Age"].fillna(int(X["Age"].mean()))  # truncated mean age
X["Embarked"] = X["Embarked"].fillna('S')
# Add FamilySize (disabled).
# X['FamilySize'] = X['SibSp'] + X['Parch'] + 1 # add 1 to include the passenger themselves
# # Fill Age using the title ("Initial") taken from Name (disabled).
# X['Initial']= X.Name.str.extract('([A-Za-z]+)\.') #lets extract the Salutations
# X['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],
# ['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr', 'Mr'],inplace=True)
# X.loc[(X.Age.isnull())&(X.Initial=='Mr'),'Age'] = 33
# X.loc[(X.Age.isnull())&(X.Initial=='Mrs'),'Age'] = 36
# X.loc[(X.Age.isnull())&(X.Initial=='Master'),'Age'] = 5
# X.loc[(X.Age.isnull())&(X.Initial=='Miss'),'Age'] = 22
# X.loc[(X.Age.isnull())&(X.Initial=='Other'),'Age'] = 46

# Integer-encode the categorical columns. The one encoder object is re-fitted
# for every column, so after the loop it only holds the last column's classes.
labels = ["Pclass", "Sex", "Embarked"]
le_encoder = LabelEncoder()
for col in labels:
    X[col] = le_encoder.fit_transform(X[col])
    # X[col] = X[col].astype('category')

# One-hot encode the multi-valued categorical columns.
dummy_col = ['Embarked', 'Pclass']
X = pd.get_dummies(X, columns=dummy_col)

# For now, drop PassengerId and Cabin, together with Name and Ticket.
drop_label = ["Cabin", "PassengerId", "Name", "Ticket"]
X = X.drop(columns=drop_label)

# Min-max scale the listed columns to the [0, 1] range.
mm_label = ["Fare"]
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
X[mm_label] = mms.fit_transform(X[mm_label])

X.describe(include="all")
Sex | Age | SibSp | Parch | Fare | Embarked_0 | Embarked_1 | Embarked_2 | Pclass_0 | Pclass_1 | Pclass_2 | |
count | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 0.647587 | 29.560236 | 0.523008 | 0.381594 | 0.062858 | 0.188552 | 0.086420 | 0.725028 | 0.242424 | 0.206510 | 0.551066 |
std | 0.477990 | 13.005010 | 1.102743 | 0.806057 | 0.096995 | 0.391372 | 0.281141 | 0.446751 | 0.428790 | 0.405028 | 0.497665 |
min | 0.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.000000 | 22.000000 | 0.000000 | 0.000000 | 0.015440 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 1.000000 | 29.000000 | 0.000000 | 0.000000 | 0.028213 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 |
75% | 1.000000 | 35.000000 | 1.000000 | 0.000000 | 0.060508 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 |
max | 1.000000 | 80.000000 | 8.000000 | 6.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated correlation heatmap of the processed features plus the target.
heatmap_data = pd.concat([X, y], axis=1)
colormap = plt.cm.RdBu
correlations = heatmap_data.astype(float).corr()

plt.figure(figsize=(14, 12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(
    correlations,
    linewidths=0.1,
    vmax=1.0,
    square=True,
    cmap=colormap,
    linecolor='white',
    annot=True,
    annot_kws={"size": 16},
)
# del heatmap_data
<AxesSubplot:title={'center':'Pearson Correlation of Features'}>
측정
from sklearn.model_selection import train_test_split

# Hold out part of the training data for validation. Fixing random_state makes
# the split, and therefore the accuracy measured on it, reproducible between
# runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=32)
X_train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 668 entries, 479 to 740
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Sex 668 non-null int64
1 Age 668 non-null float64
2 SibSp 668 non-null int64
3 Parch 668 non-null int64
4 Fare 668 non-null float64
5 Embarked_0 668 non-null uint8
6 Embarked_1 668 non-null uint8
7 Embarked_2 668 non-null uint8
8 Pclass_0 668 non-null uint8
9 Pclass_1 668 non-null uint8
10 Pclass_2 668 non-null uint8
dtypes: float64(2), int64(3), uint8(6)
memory usage: 35.2 KB
from xgboost import XGBClassifier

# Example model declaration: an XGBoost classifier with fixed hyperparameters,
# fitted on the training part of the split.
xgb_params = {
    "n_estimators": 500,
    "learning_rate": 0.2,
    "max_depth": 3,
    "random_state": 32,
    "gamma": 1,
}
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, gamma=1, gpu_id=-1, grow_policy='depthwise',
importance_type=None, interaction_constraints='',
learning_rate=0.2, max_bin=256, max_cat_to_onehot=4,
max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()', n_estimators=500,
n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=32,
reg_alpha=0, reg_lambda=1, ...)
from sklearn.metrics import accuracy_score

# Accuracy on the held-out split. accuracy_score is documented as
# (y_true, y_pred), so the true labels are passed first.
pred = model.predict(X_test)
accuracy_score(y_test, pred)
0.8251121076233184
# Alternative model considered:
# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier()

# Refit on the complete training set. The score below is computed on the same
# rows the model was fitted on, i.e. it is a training score.
model.fit(X, y)
train_score = model.score(X, y)
print(train_score)
0.8630751964085297
Test
# Preprocess the competition test set with transforms learned from the
# TRAINING data (df / X). Re-fitting imputers, encoders or scalers on the test
# set would give the model features on a different footing than it was
# fitted on.
Test = pd.read_csv("/kaggle/input/titanic/test.csv")
# Test.dropna(subset=["Embarked","Fare"], inplace=True)

# Handle missing Age, Fare and Embarked values.
# X.dropna(subset=["Embarked"], inplace=True)
# Fill values come from the raw training frame, and the filled column is
# assigned back: fillna(inplace=True) on Test[...] is deprecated in pandas 2.x
# and no longer updates Test once copy-on-write is enabled.
Test["Age"] = Test["Age"].fillna(int(df["Age"].mean()))
Test["Fare"] = Test["Fare"].fillna(int(df["Fare"].mean()))
Test["Embarked"] = Test["Embarked"].fillna('S')  # same fill value as the training set
# Fill Age using the title ("Initial") taken from Name (disabled).
# Test['Initial']= Test.Name.str.extract('([A-Za-z]+)\.') #lets extract the Salutations
# Test['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],
# ['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr', 'Mr'],inplace=True)
# Test.loc[(Test.Age.isnull())&(Test.Initial=='Mr'),'Age'] = 33
# Test.loc[(Test.Age.isnull())&(Test.Initial=='Mrs'),'Age'] = 36
# Test.loc[(Test.Age.isnull())&(Test.Initial=='Master'),'Age'] = 5
# Test.loc[(Test.Age.isnull())&(Test.Initial=='Miss'),'Age'] = 22
# Test.loc[(Test.Age.isnull())&(Test.Initial=='Other'),'Age'] = 46
# Test['FamilySize'] = Test['SibSp'] + Test['Parch'] + 1 # add 1 to include the passenger themselves

# Integer-encode the categorical columns with encoders fitted on the training
# values, so each category maps to the same code as in X. transform() raises
# on a category that never occurs in the training data instead of silently
# assigning it a shifted code.
labels = ["Pclass", "Sex", "Embarked"]
for col in labels:
    le_encoder = LabelEncoder().fit(df[col].dropna())
    Test[col] = le_encoder.transform(Test[col])
    # Test[col] = Test[col].astype('category')

# One-hot encode.
dummy_col = ['Embarked', 'Pclass']
Test = pd.get_dummies(Test, columns=dummy_col)

# For now, drop PassengerId and Cabin, together with Name and Ticket.
drop_label = ["Cabin", "PassengerId", "Name", "Ticket"]
Test = Test.drop(columns=drop_label)

# Give Test exactly the training columns, in the same order; a dummy column
# that has no rows in the test set is added as all zeros.
Test = Test.reindex(columns=X.columns, fill_value=0)

# Min-max scale with the range of the raw training column, not the test range.
mm_label = ["Fare"]
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler().fit(df[mm_label])
Test[mm_label] = mms.transform(Test[mm_label])

Test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Sex 418 non-null int64
1 Age 418 non-null float64
2 SibSp 418 non-null int64
3 Parch 418 non-null int64
4 Fare 418 non-null float64
5 Embarked_0 418 non-null uint8
6 Embarked_1 418 non-null uint8
7 Embarked_2 418 non-null uint8
8 Pclass_0 418 non-null uint8
9 Pclass_1 418 non-null uint8
10 Pclass_2 418 non-null uint8
dtypes: float64(2), int64(3), uint8(6)
memory usage: 18.9 KB
# Predict the competition test set and wrap the result in a one-column frame.
test_labels = model.predict(Test)
test_pred = pd.DataFrame(test_labels)
Feature Importance
from pandas import Series

# Horizontal bar chart of the model's feature importances, sorted ascending
# and labelled with the column names of the preprocessed test frame.
feature_importance = model.feature_importances_
Series_feat_imp = Series(feature_importance, index=Test.columns)
ranked_importance = Series_feat_imp.sort_values(ascending=True)

plt.figure(figsize=(8, 8))
ranked_importance.plot.barh()
plt.xlabel('Feature importance')
plt.ylabel('Feature')
plt.show()
Submission
# Load the sample submission file to reuse as the submission template.
submission_template_path = "/kaggle/input/titanic/gender_submission.csv"
sub = pd.read_csv(submission_template_path)
sub
Out[36]:
PassengerId | Survived |
892 | 0 |
893 | 1 |
894 | 0 |
895 | 0 |
896 | 1 |
... | ... |
1305 | 0 |
1306 | 1 |
1307 | 0 |
1308 | 0 |
1309 | 0 |
418 rows × 2 columns
# Overwrite the template's Survived column with the model's predictions and
# show how many passengers fall into each predicted class.
sub['Survived'] = test_pred
survived_counts = sub['Survived'].value_counts()
survived_counts
0 283
1 135
Name: Survived, dtype: int64
# astype() returns a new Series rather than converting in place, so assign the
# result back before displaying the column.
sub["Survived"] = sub["Survived"].astype(int)
sub["Survived"]
0 0
1 0
2 0
3 0
4 0
..
413 0
414 1
415 0
416 0
417 0
Name: Survived, Length: 418, dtype: int64
# Display the submission frame after its Survived column was overwritten.
sub
Out[40]:
PassengerId | Survived |
892 | 0 |
893 | 0 |
894 | 0 |
895 | 0 |
896 | 0 |
... | ... |
1305 | 0 |
1306 | 1 |
1307 | 0 |
1308 | 0 |
1309 | 0 |
418 rows × 2 columns
from datetime import datetime
from pytz import timezone

# Current time in the Asia/Seoul zone; `date` is the compact form used in the
# submission filename.
# NOTE(review): `date` contains ':' characters, which some filesystems reject
# in file names — confirm the target environment accepts them.
seoul_tz = timezone('Asia/Seoul')
today = datetime.now(seoul_tz)
date = today.strftime('%m.%d_%H:%M:%S')
print(today.strftime('%Y.%m.%d - %H:%M:%S'))
2022.12.30 - 14:01:54
# Write the submission, without the index column, to a timestamped CSV file.
filename = f"submission_{date}.csv"
sub.to_csv(filename, index=False)
'@@@ 데이터분석 > Kaggle' 카테고리의 다른 글
[Kaggle] XGBoost 알아보기 (0) | 2023.01.05 |
---|---|
[Kaggle] Titanic 성능 향상(23.01.04) (0) | 2023.01.04 |
[Kaggle] Kaggle 자주 사용하는 함수 (0) | 2022.12.27 |
[Kaggle] 분류 문제 - Titanic - Machine Learning from Disaster (2) (0) | 2022.12.26 |
[Kaggle] 분류 문제 - Titanic - Machine Learning from Disaster (1) (0) | 2022.12.26 |