9-1. Predicting FIFA 19 Player Positions with Machine Learning_1 [K-NN]

We use the k-NN algorithm to predict player positions from the FIFA 19 data.

[main code]

Predicting FIFA 19 Player Positions with Machine Learning [K-NN]

import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Problem Definition

Predict a player's position from their FIFA 19 stats.

Data Collection and Preprocessing

# Load the collected data
df = pd.read_csv("../data/fifa_data.csv")
# Check a sample of the loaded data
df.head()
| | Unnamed: 0 | ID | Name | Age | Photo | Nationality | Flag | Overall | Potential | Club | ... | Composure | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | Release Clause |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 158023 | L. Messi | 31 | https://cdn.sofifa.org/players/4/19/158023.png | Argentina | https://cdn.sofifa.org/flags/52.png | 94 | 94 | FC Barcelona | ... | 96.0 | 33.0 | 28.0 | 26.0 | 6.0 | 11.0 | 15.0 | 14.0 | 8.0 | €226.5M |
| 1 | 1 | 20801 | Cristiano Ronaldo | 33 | https://cdn.sofifa.org/players/4/19/20801.png | Portugal | https://cdn.sofifa.org/flags/38.png | 94 | 94 | Juventus | ... | 95.0 | 28.0 | 31.0 | 23.0 | 7.0 | 11.0 | 15.0 | 14.0 | 11.0 | €127.1M |
| 2 | 2 | 190871 | Neymar Jr | 26 | https://cdn.sofifa.org/players/4/19/190871.png | Brazil | https://cdn.sofifa.org/flags/54.png | 92 | 93 | Paris Saint-Germain | ... | 94.0 | 27.0 | 24.0 | 33.0 | 9.0 | 9.0 | 15.0 | 15.0 | 11.0 | €228.1M |
| 3 | 3 | 193080 | De Gea | 27 | https://cdn.sofifa.org/players/4/19/193080.png | Spain | https://cdn.sofifa.org/flags/45.png | 91 | 93 | Manchester United | ... | 68.0 | 15.0 | 21.0 | 13.0 | 90.0 | 85.0 | 87.0 | 88.0 | 94.0 | €138.6M |
| 4 | 4 | 192985 | K. De Bruyne | 27 | https://cdn.sofifa.org/players/4/19/192985.png | Belgium | https://cdn.sofifa.org/flags/7.png | 91 | 92 | Manchester City | ... | 88.0 | 68.0 | 58.0 | 51.0 | 15.0 | 13.0 | 5.0 | 10.0 | 13.0 | €196.4M |

5 rows × 89 columns

df1 = pd.DataFrame({'Name': df.Name, 'Club': df.Club, 'Position': df.Position})  # player info data
df2 = df.iloc[:, 54:88].astype(float)  # stat data

td = pd.concat([df1,df2],axis=1)
td
| | Name | Club | Position | Crossing | Finishing | HeadingAccuracy | ShortPassing | Volleys | Dribbling | Curve | ... | Penalties | Composure | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | L. Messi | FC Barcelona | RF | 84.0 | 95.0 | 70.0 | 90.0 | 86.0 | 97.0 | 93.0 | ... | 75.0 | 96.0 | 33.0 | 28.0 | 26.0 | 6.0 | 11.0 | 15.0 | 14.0 | 8.0 |
| 1 | Cristiano Ronaldo | Juventus | ST | 84.0 | 94.0 | 89.0 | 81.0 | 87.0 | 88.0 | 81.0 | ... | 85.0 | 95.0 | 28.0 | 31.0 | 23.0 | 7.0 | 11.0 | 15.0 | 14.0 | 11.0 |
| 2 | Neymar Jr | Paris Saint-Germain | LW | 79.0 | 87.0 | 62.0 | 84.0 | 84.0 | 96.0 | 88.0 | ... | 81.0 | 94.0 | 27.0 | 24.0 | 33.0 | 9.0 | 9.0 | 15.0 | 15.0 | 11.0 |
| 3 | De Gea | Manchester United | GK | 17.0 | 13.0 | 21.0 | 50.0 | 13.0 | 18.0 | 21.0 | ... | 40.0 | 68.0 | 15.0 | 21.0 | 13.0 | 90.0 | 85.0 | 87.0 | 88.0 | 94.0 |
| 4 | K. De Bruyne | Manchester City | RCM | 93.0 | 82.0 | 55.0 | 92.0 | 82.0 | 86.0 | 85.0 | ... | 79.0 | 88.0 | 68.0 | 58.0 | 51.0 | 15.0 | 13.0 | 5.0 | 10.0 | 13.0 |

18207 rows × 37 columns
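
Selecting the stat columns by integer position (iloc[:, 54:88]) works, but it silently breaks if the CSV column order ever changes. A name-based slice is a more robust alternative; a minimal sketch, assuming (as in this file) that the stat columns run contiguously from 'Crossing' to 'GKReflexes':

# equivalent stat-column selection by label instead of position
df2_by_name = df.loc[:, 'Crossing':'GKReflexes'].astype(float)

# sanity check: both selections should contain the same columns
assert list(df2_by_name.columns) == list(df2.columns)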

# Check how many players we have for each position
td.Position.value_counts()
ST     2152
GK     2025
CB     1778
CM     1394
LB     1322
RB     1291
RM     1124
LM     1095
CAM     958
CDM     948
RCB     662
LCB     648
LCM     395
RCM     391
LW      381
RW      370
RDM     248
LDM     243
LS      207
RS      203
RWB      87
LWB      78
CF       74
RAM      21
LAM      21
RF       16
LF       15
Name: Position, dtype: int64
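
A quick bar chart makes this class imbalance easier to see before merging; a minimal sketch using the seaborn import from above:

# visualize how many players there are per position, largest first
sns.countplot(y=td['Position'], order=td['Position'].value_counts().index)
plt.show()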
  • Merge positions that play similar roles to increase the amount of data per position.
td.loc[td['Position']=='LF', ['Position']] = 'ST'
td.loc[td['Position']=='RF', ['Position']] = 'ST'
td.loc[td['Position']=='CF', ['Position']] = 'ST'
td.loc[td['Position']=='LS', ['Position']] = 'ST'
td.loc[td['Position']=='RS', ['Position']] = 'ST'
td.loc[td['Position']=='LAM', ['Position']] = 'CAM'
td.loc[td['Position']=='RAM', ['Position']] = 'CAM'
td.loc[td['Position']=='LCM', ['Position']] = 'CM'
td.loc[td['Position']=='RCM', ['Position']] = 'CM'
td.loc[td['Position']=='RDM', ['Position']] = 'CDM'
td.loc[td['Position']=='LDM', ['Position']] = 'CDM'
td.loc[td['Position']=='LW', ['Position']] = 'WF'
td.loc[td['Position']=='RW', ['Position']] = 'WF'
td.loc[td['Position']=='LB', ['Position']] = 'WB'
td.loc[td['Position']=='RB', ['Position']] = 'WB'
td.loc[td['Position']=='LWB', ['Position']] = 'WB'
td.loc[td['Position']=='RWB', ['Position']] = 'WB'
td.loc[td['Position']=='LM', ['Position']] = 'WM'
td.loc[td['Position']=='RM', ['Position']] = 'WM'
td.loc[td['Position']=='LCB', ['Position']] = 'CB'
td.loc[td['Position']=='RCB', ['Position']] = 'CB'
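
The repeated .loc assignments above can also be expressed as a single dictionary lookup. A minimal equivalent sketch using Series.replace (run instead of, not in addition to, the block above; the position_map dict simply restates the assignments):

# merge similar positions in one step
position_map = {
    'LF': 'ST', 'RF': 'ST', 'CF': 'ST', 'LS': 'ST', 'RS': 'ST',
    'LAM': 'CAM', 'RAM': 'CAM',
    'LCM': 'CM', 'RCM': 'CM',
    'RDM': 'CDM', 'LDM': 'CDM',
    'LW': 'WF', 'RW': 'WF',
    'LB': 'WB', 'RB': 'WB', 'LWB': 'WB', 'RWB': 'WB',
    'LM': 'WM', 'RM': 'WM',
    'LCB': 'CB', 'RCB': 'CB',
}
td['Position'] = td['Position'].replace(position_map)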
# Check the position counts again after merging
td.Position.value_counts()
CB     3088
WB     2778
ST     2667
WM     2219
CM     2180
GK     2025
CDM    1439
CAM    1000
WF      751
Name: Position, dtype: int64
td = td.dropna()  # drop rows with missing values

Splitting the Data (Training Data, Test Data)

# sklearn's train_test_split splits the data in a single line
from sklearn.model_selection import train_test_split

# Set aside 20% of the cleaned data as test data
train, test = train_test_split(td, test_size=0.2)
# Check the number of training samples: 14,334 in this run
train.shape[0]
14334
# Check the number of test samples: 3,584 in this run
test.shape[0]
3584
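
Because train_test_split shuffles the rows randomly, the exact counts above change from run to run. Passing random_state makes the split reproducible, and stratify keeps the position proportions similar in both sets; a minimal sketch (42 is an arbitrary seed, and stratify is optional):

# reproducible, stratified 80/20 split
train, test = train_test_split(td, test_size=0.2,
                               random_state=42, stratify=td['Position'])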

Saving the Cleaned Data to Files

We save the cleaned data to files so that it can be reused in the other machine learning classification exercises.

with open('../data/fifa_train.pkl', 'wb') as train_data:
    pickle.dump(train, train_data)
    
with open('../data/fifa_test.pkl', 'wb') as test_data:
    pickle.dump(test, test_data)

Loading the Data (Training Data, Test Data)

Load the training and test data.

with open('../data/fifa_train.pkl', 'rb') as train_data:
    train = pickle.load(train_data)
    
with open('../data/fifa_test.pkl', 'rb') as test_data:
    test = pickle.load(test_data)
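
As an aside, the same round trip can be written without explicit file handles, because pandas DataFrames ship their own pickle helpers; an equivalent sketch using to_pickle / read_pickle with the same paths:

# save the cleaned splits
train.to_pickle('../data/fifa_train.pkl')
test.to_pickle('../data/fifa_test.pkl')

# load them back
train = pd.read_pickle('../data/fifa_train.pkl')
test = pd.read_pickle('../data/fifa_test.pkl')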

Finding the Optimal k (Cross Validation)

# import kNN library
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# search for the best k, from 7 up to half the number of training samples, in steps of 179
max_k_range = train.shape[0] // 2
k_list = []
for i in range(7, max_k_range, 179):
    k_list.append(i)

cross_validation_scores = []
x_train = train.iloc[:,3:37]
y_train = train[['Position']]

# 10-fold cross validation
for k in k_list:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, x_train, y_train.values.ravel(), cv=10, scoring='accuracy')
    cross_validation_scores.append(scores.mean())

cross_validation_scores
# visualize accuracy according to k
plt.plot(k_list, cross_validation_scores)
plt.xlabel('the number of k')
plt.ylabel('Accuracy')
plt.show()

[Figure: 10-fold cross-validation accuracy by k]

# find best k
cvs = cross_validation_scores
k = k_list[cvs.index(max(cross_validation_scores))]
print("The best number of k : " + str(k) )
The best number of k : 186
  • Narrow the range and try again.
# find best k, searching a narrower range (7 to 366 in steps of 9)
k_list = []
for i in range(7, 367, 9):
    k_list.append(i)

cross_validation_scores2 = []
x_train = train.iloc[:,3:37]
y_train = train[['Position']]

# 10-fold cross validation
for k in k_list:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, x_train, y_train.values.ravel(), cv=10, scoring='accuracy')
    cross_validation_scores2.append(scores.mean())

cross_validation_scores2
# visualize accuracy according to k
plt.plot(k_list, cross_validation_scores2)
plt.xlabel('the number of k')
plt.ylabel('Accuracy')
plt.show()

[Figure: 10-fold cross-validation accuracy by k (narrowed range)]

# find best k
cvs2 = cross_validation_scores2
k = k_list[cvs2.index(max(cross_validation_scores2))]
print("The best number of k : " + str(k) )
The best number of k : 61
# find best k, searching around the previous best value (52 to 69)
k_list = []
for i in range(52, 70):
    k_list.append(i)

cross_validation_scores3 = []
x_train = train.iloc[:,3:37]
y_train = train[['Position']]

# 10-fold cross validation
for k in k_list:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, x_train, y_train.values.ravel(), cv=10, scoring='accuracy')
    cross_validation_scores3.append(scores.mean())

cross_validation_scores3
# visualize accuracy according to k
plt.plot(k_list, cross_validation_scores3)
plt.xlabel('the number of k')
plt.ylabel('Accuracy')
plt.show()

[Figure: 10-fold cross-validation accuracy by k (final range)]

# find best k
cvs3 = cross_validation_scores3
k = k_list[cvs3.index(max(cross_validation_scores3))]
print("The best number of k : " + str(k) )
The best number of k : 60
  • The optimal k is 60.
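
The coarse-to-fine search above can also be automated with scikit-learn's GridSearchCV, which runs the same 10-fold cross validation over a grid of k values and reports the best one directly; a minimal sketch over the final range searched above:

from sklearn.model_selection import GridSearchCV

# 10-fold cross validation over k = 52..69
param_grid = {'n_neighbors': list(range(52, 70))}
grid = GridSearchCV(KNeighborsClassifier(), param_grid,
                    cv=10, scoring='accuracy')
grid.fit(x_train, y_train.values.ravel())
print(grid.best_params_, grid.best_score_)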

Testing the Model

# import libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

knn = KNeighborsClassifier(n_neighbors=k)

# select data features
x_train = train.iloc[:,3:37]
# select target value
y_train = train[['Position']]

# setup knn using train data
knn.fit(x_train, y_train.values.ravel())

# select data feature to be used for prediction
x_test = test.iloc[:,3:37]

# select target value
y_test = test[['Position']]

# test
pred = knn.predict(x_test)
# check ground_truth with knn prediction
comparison = pd.DataFrame(
    {'prediction':pred, 'ground_truth':y_test.values.ravel()}) 
comparison
| | prediction | ground_truth |
|---|---|---|
| 0 | WM | CAM |
| 1 | CM | WB |
| 2 | ST | WM |
| 3 | WB | WM |
| 4 | CDM | CDM |
| 5 | CDM | CB |
| 6 | WM | CAM |
| 7 | CB | CB |
| 8 | WB | WB |
| 9 | GK | GK |
| 10 | GK | GK |

3584 rows × 2 columns

# check accuracy
print("accuracy : "+ 
          str(accuracy_score(y_test.values.ravel(), pred)) )
accuracy : 0.7279575892857143
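
Overall accuracy hides which positions get confused with one another (for example, WM vs. CAM in the comparison above). Since seaborn is already imported, a per-position breakdown can be added with a confusion-matrix heatmap; a minimal sketch:

from sklearn.metrics import confusion_matrix

# rows are true positions, columns are predicted positions
labels = sorted(td.Position.unique())
cm = confusion_matrix(y_test.values.ravel(), pred, labels=labels)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels)
plt.xlabel('prediction')
plt.ylabel('ground truth')
plt.show()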
