数据可从 Kaggle 上自行下载:
import pandas as pd
import numpy as np
# Read the training CSV from its local path and preview the first rows.
train_csv_path = r"F:\AI人工智能\正式课程\8月30日Day03-决策树与Kaggle项目实践\pubg(竞赛)参考模型\data\train_V2.csv"
df = pd.read_csv(train_csv_path)
df.head()
| Id | groupId | matchId | assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | ... | revives | rideDistance | roadKills | swimDistance | teamKills | vehicleDestroys | walkDistance | weaponsAcquired | winPoints | winPlacePerc |
---|
0 | 7f96b2f878858a | 4d4b580de459be | a10357fd1a4a91 | 0 | 0 | 0.0 | 0 | 0 | 0 | 60 | ... | 0.0 | 0.0000 | 0.0 | 0.00 | 0 | 0.0 | 244.80 | 1 | 1466.0 | 0.4444 |
---|
1 | eef90569b9d03c | 684d5656442f9e | aeb375fc57110c | 0 | 0 | 91.47 | 0 | 0 | 0 | 57 | ... | 0.0 | 0.0045 | 0.0 | 11.04 | 0 | 0.0 | 1434.00 | 5 | 0.0 | 0.6400 |
---|
2 | 1eaf90ac73de72 | 6a4a42c3245a74 | 110163d8bb94ae | 1 | 0 | 68.0 | 0 | 0 | 0 | 47 | ... | 0.0 | 0.0000 | 0.0 | 0.00 | 0 | 0.0 | 161.80 | 2 | 0.0 | 0.7755 |
---|
3 | 4616d365dd2853 | a930a9c79cd721 | f1f1f4ef412d7e | 0 | 0 | 32.9 | 0 | 0 | 0 | 75 | ... | 0.0 | 0.0000 | 0.0 | 0.00 | 0 | 0.0 | 202.70 | 3 | 0.0 | 0.1667 |
---|
4 | 315c96c26c9aac | de04010b3458dd | 6dc8ff871e21e6 | 0 | 0 | 100.0 | 0 | 0 | 0 | 45 | ... | 0.0 | 0.0000 | 0.0 | 0.00 | 0 | 0.0 | 49.75 | 2 | 0.0 | 0.1875 |
---|
5 rows × 29 columns
# Call info() so the concise column/dtype summary is printed; the bare
# attribute `df.info` only echoes the bound method's repr.
df.info()
<bound method DataFrame.info of Id groupId matchId assists boosts \
0 7f96b2f878858a 4d4b580de459be a10357fd1a4a91 0 0
1 eef90569b9d03c 684d5656442f9e aeb375fc57110c 0 0
2 1eaf90ac73de72 6a4a42c3245a74 110163d8bb94ae 1 0
3 4616d365dd2853 a930a9c79cd721 f1f1f4ef412d7e 0 0
4 315c96c26c9aac de04010b3458dd 6dc8ff871e21e6 0 0
... ... ... ... ... ...
4446961 afff7f652dbc10 d238e426f50de7 18492834ce5635 0 0
4446962 f4197cf374e6c0 408cdb5c46b2ac ee854b837376d9 0 1
4446963 e1948b1295c88a e26ac84bdf7cef 6d0cd12784f1ab 0 0
4446964 cc032cdd73b7ac c2223f35411394 c9c701d0ad758a 0 4
4446965 0d8e7ed728b6fd 8c74f72fedf5ff 62a16aabcc095c 0 2 damageDealt DBNOs headshotKills heals killPlace ... revives \
0 0.0 0 0 0 60 ... 0.0
1 91.47 0 0 0 57 ... 0.0
2 68.0 0 0 0 47 ... 0.0
3 32.9 0 0 0 75 ... 0.0
4 100.0 0 0 0 45 ... 0.0
... ... ... ... ... ... ... ...
4446961 0.0 0 0 0 74 ... 0.0
4446962 44.15 0 0 0 69 ... 0.0
4446963 59.06 0 0 0 66 ... 0.0
4446964 180.4 1 1 2 11 ... 2.0
4446965 268.0 0 0 1 18 ... 0.0 rideDistance roadKills swimDistance teamKills vehicleDestroys \
0 0.0000 0.0 0.000 0 0.0
1 0.0045 0.0 11.040 0 0.0
2 0.0000 0.0 0.000 0 0.0
3 0.0000 0.0 0.000 0 0.0
4 0.0000 0.0 0.000 0 0.0
... ... ... ... ... ...
4446961 1292.0000 0.0 0.000 0 0.0
4446962 0.0000 0.0 0.000 0 0.0
4446963 0.0000 0.0 2.184 0 0.0
4446964 0.0000 0.0 0.000 0 0.0
4446965 1369.0000 0.0 0.000 0 0.0 walkDistance weaponsAcquired winPoints winPlacePerc
0 244.80 1 1466.0 0.4444
1 1434.00 5 0.0 0.6400
2 161.80 2 0.0 0.7755
3 202.70 3 0.0 0.1667
4 49.75 2 0.0 0.1875
... ... ... ... ...
4446961 1019.00 3 1507.0 0.1786
4446962 81.70 6 0.0 0.2935
4446963 788.70 4 0.0 0.4815
4446964 2748.00 8 0.0 0.8000
4446965 1244.00 5 0.0 0.5464 [4446966 rows x 29 columns]>
# Collect the distinct group identifiers; the displayed shape tuple holds
# how many there are.
distinct_group_ids = np.unique(df['groupId'])
distinct_group_ids.shape
(2026745,)
# Report how many missing values each column contains.
missing_per_column = df.isna().sum()
print(missing_per_column)
Id 0
groupId 0
matchId 0
assists 0
boosts 0
damageDealt 0
DBNOs 0
headshotKills 0
heals 0
killPlace 0
killPoints 0
kills 0
killStreaks 0
longestKill 0
matchDuration 0
matchType 0
maxPlace 0
numGroups 0
rankPoints 0
revives 0
rideDistance 0
roadKills 0
swimDistance 0
teamKills 0
vehicleDestroys 0
walkDistance 0
weaponsAcquired 0
winPoints 0
winPlacePerc 1
dtype: int64
# Display the rows whose winPlacePerc value is missing.
target_is_missing = df['winPlacePerc'].isna()
df[target_is_missing]
| Id | groupId | matchId | assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | ... | revives | rideDistance | roadKills | swimDistance | teamKills | vehicleDestroys | walkDistance | weaponsAcquired | winPoints | winPlacePerc |
---|
2744604 | f70c74418bb064 | 12dfbede33f92b | 224a123c53e008 | 0 | 0 | 0.0 | 0 | 0 | 0 | 1 | ... | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | NaN |
---|
1 rows × 29 columns
# Remove every row whose winPlacePerc is missing. Selecting by the NaN
# condition instead of a hard-coded row label keeps this cell working if the
# CSV is re-ordered, re-indexed, or contains a different number of such rows.
df.dropna(subset=['winPlacePerc'], inplace=True)
# Re-check the per-column missing counts to confirm nothing is left.
print(df.isnull().sum())
Id 0
groupId 0
matchId 0
assists 0
boosts 0
damageDealt 0
DBNOs 0
headshotKills 0
heals 0
killPlace 0
killPoints 0
kills 0
killStreaks 0
longestKill 0
matchDuration 0
matchType 0
maxPlace 0
numGroups 0
rankPoints 0
revives 0
rideDistance 0
roadKills 0
swimDistance 0
teamKills 0
vehicleDestroys 0
walkDistance 0
weaponsAcquired 0
winPoints 0
winPlacePerc 0
dtype: int64
# Number of rows (players) recorded for each match, smallest matches first.
players_per_match = df.groupby('matchId')['matchId'].count()
print(players_per_match.sort_values())
matchId
e5a77433bc436f 2
8aa5066c4b6a79 5
9f2b49358564ea 6
39d2800dad8ae6 6
bc10cc08f1f56a 7
...
51b68a308313f9 100
1f6851edf8ad9e 100
3bd0db1836530f 100
3be2b807005541 100
5ee9860774ef58 100
Name: matchId, Length: 47964, dtype: int64
# Attach to every row the number of players recorded in its match.
per_match = df.groupby('matchId')
count = per_match['matchId'].transform('count')
df['playersJoined'] = count
df.head()
| Id | groupId | matchId | assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | ... | rideDistance | roadKills | swimDistance | teamKills | vehicleDestroys | walkDistance | weaponsAcquired | winPoints | winPlacePerc | playersJoined |
---|
0 | 7f96b2f878858a | 4d4b580de459be | a10357fd1a4a91 | 0 | 0 | 0.00 | 0 | 0 | 0 | 60 | ... | 0.0000 | 0 | 0.00 | 0 | 0 | 244.80 | 1 | 1466 | 0.4444 | 96 |
---|
1 | eef90569b9d03c | 684d5656442f9e | aeb375fc57110c | 0 | 0 | 91.47 | 0 | 0 | 0 | 57 | ... | 0.0045 | 0 | 11.04 | 0 | 0 | 1434.00 | 5 | 0 | 0.6400 | 91 |
---|
2 | 1eaf90ac73de72 | 6a4a42c3245a74 | 110163d8bb94ae | 1 | 0 | 68.00 | 0 | 0 | 0 | 47 | ... | 0.0000 | 0 | 0.00 | 0 | 0 | 161.80 | 2 | 0 | 0.7755 | 98 |
---|
3 | 4616d365dd2853 | a930a9c79cd721 | f1f1f4ef412d7e | 0 | 0 | 32.90 | 0 | 0 | 0 | 75 | ... | 0.0000 | 0 | 0.00 | 0 | 0 | 202.70 | 3 | 0 | 0.1667 | 91 |
---|
4 | 315c96c26c9aac | de04010b3458dd | 6dc8ff871e21e6 | 0 | 0 | 100.00 | 0 | 0 | 0 | 45 | ... | 0.0000 | 0 | 0.00 | 0 | 0 | 49.75 | 2 | 0 | 0.1875 | 97 |
---|
5 rows × 30 columns
# Reduce to one (matchId, playersJoined) row per match and count the rows.
match_size_columns = ['matchId', 'playersJoined']
id_num = df[match_size_columns]
unique_id_num = id_num.drop_duplicates(keep='first')
unique_id_num.count()
matchId 47964
playersJoined 47964
dtype: int64
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Distribution of match sizes: one bar per distinct playersJoined value.
# The Series is passed explicitly as `x=` because newer seaborn releases
# reserve the only positional argument for `data`, so a positional Series is
# no longer read as the x variable.
plt.figure(figsize=(20,10))
sns.countplot(x=unique_id_num['playersJoined'])
plt.show()

# Same plot restricted to matches with at least 75 players.
plt.figure(figsize=(20,10))
large_matches = unique_id_num[unique_id_num['playersJoined']>=75]
sns.countplot(x=large_matches['playersJoined'])
plt.show()

# Scaling factor that grows as fewer players joined the match
# (1.0 for 100 players, 1.5 for 50 players).
players_factor = (100 - df['playersJoined']) / 100 + 1
for raw_column in ('kills', 'damageDealt', 'maxPlace', 'matchDuration'):
    df[raw_column + 'Norm'] = df[raw_column] * players_factor

# Combined item usage and combined distance travelled.
df['healsandboosts'] = df['heals'] + df['boosts']
df['totalDistance'] = df['rideDistance'] + df['walkDistance'] + df['swimDistance']

# Flag rows that report kills although the total distance is exactly zero.
has_kills = df['kills'].gt(0)
never_moved = df['totalDistance'].eq(0)
df['killsWithoutMoving'] = has_kills & never_moved
df['killsWithoutMoving'].describe()
count 4446965
unique 2
top False
freq 4445430
Name: killsWithoutMoving, dtype: object
# Shape of the subset flagged as killing without moving (boolean column used
# directly as the row mask).
flagged_rows = df[df['killsWithoutMoving']]
flagged_rows.shape
(1535, 37)
# Drop the rows flagged as killing without moving.
stationary_killers = df.index[df['killsWithoutMoving']]
df.drop(stationary_killers, inplace=True)
# Drop the rows with more than 10 road kills.
road_kill_outliers = df.index[df['roadKills'] > 10]
df.drop(road_kill_outliers, inplace=True)
df.shape
(4445426, 37)
# Drop the rows with more than 30 kills (treated here as outliers).
df.drop(df.index[df['kills'] > 30], inplace=True)
# Share of kills that were headshots; a 0/0 division yields NaN, mapped to 0.
df['headshot_rate'] = (df['headshotKills'] / df['kills']).fillna(0)
# Histogram of the headshot rate in 10 bins, without a density curve.
plt.figure(figsize=(12,4))
sns.distplot(df['headshot_rate'], bins=10, kde=False)
plt.show()

# Drop rows where every kill was a headshot and there were more than 9 kills.
all_headshots = df['headshot_rate'].eq(1)
many_kills = df['kills'].gt(9)
df.drop(df.index[all_headshots & many_kills], inplace=True)
# Histogram of the longest kill distance in 10 bins, without a density curve.
plt.figure(figsize=(12,4))
sns.distplot(df['longestKill'], bins=10, kde=False)
plt.show()

# Rows at or above any of these per-column limits are removed, one column at
# a time and in this order.
outlier_limits = (
    ('longestKill', 1000),
    ('walkDistance', 10000),
    ('rideDistance', 20000),
    ('swimDistance', 2000),
    ('weaponsAcquired', 80),
    ('heals', 40),
)
for limited_column, limit in outlier_limits:
    df.drop(df.index[df[limited_column] >= limit], inplace=True)
# List the distinct match types that remain.
df['matchType'].unique()
array(['squad-fpp', 'duo', 'solo-fpp', 'squad', 'duo-fpp', 'solo','normal-squad-fpp', 'crashfpp', 'flaretpp', 'normal-solo-fpp','flarefpp', 'normal-duo-fpp', 'normal-duo', 'normal-squad','crashtpp', 'normal-solo'], dtype=object)
# One-hot encode matchType, then preview the generated indicator columns.
categorical_columns = ['matchType']
df = pd.get_dummies(data=df, columns=categorical_columns)
matchType_encoding = df.filter(regex='matchType')
matchType_encoding.head()
| matchType_crashfpp | matchType_crashtpp | matchType_duo | matchType_duo-fpp | matchType_flarefpp | matchType_flaretpp | matchType_normal-duo | matchType_normal-duo-fpp | matchType_normal-solo | matchType_normal-solo-fpp | matchType_normal-squad | matchType_normal-squad-fpp | matchType_solo | matchType_solo-fpp | matchType_squad | matchType_squad-fpp |
---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
---|
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
---|
2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
---|
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
---|
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
---|
# Convert each identifier column to a categorical and store its integer codes
# in a companion "<name>_cat" column.
for id_column in ('groupId', 'matchId'):
    df[id_column] = df[id_column].astype('category')
    df[id_column + "_cat"] = df[id_column].cat.codes
df["matchId_cat"].head()
0 30085
1 32751
2 3143
3 45260
4 20531
Name: matchId_cat, dtype: int32
# The raw identifier columns are no longer needed once their integer codes
# exist; remove them and print the resulting schema.
raw_id_columns = ['groupId', 'matchId']
df.drop(columns=raw_id_columns, inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4444752 entries, 0 to 4446965
Data columns (total 53 columns):# Column Dtype
--- ------ ----- 0 Id object 1 assists int64 2 boosts int64 3 damageDealt float644 DBNOs int64 5 headshotKills int64 6 heals int64 7 killPlace int64 8 killPoints int64 9 kills int64 10 killStreaks int64 11 longestKill float6412 matchDuration int64 13 maxPlace int64 14 numGroups int64 15 rankPoints int64 16 revives int64 17 rideDistance float6418 roadKills int64 19 swimDistance float6420 teamKills int64 21 vehicleDestroys int64 22 walkDistance float6423 weaponsAcquired int64 24 winPoints int64 25 winPlacePerc float6426 playersJoined int64 27 killsNorm float6428 damageDealtNorm float6429 maxPlaceNorm float6430 matchDurationNorm float6431 healsandboosts int64 32 totalDistance float6433 killsWithoutMoving bool 34 headshot_rate float6435 matchType_crashfpp uint8 36 matchType_crashtpp uint8 37 matchType_duo uint8 38 matchType_duo-fpp uint8 39 matchType_flarefpp uint8 40 matchType_flaretpp uint8 41 matchType_normal-duo uint8 42 matchType_normal-duo-fpp uint8 43 matchType_normal-solo uint8 44 matchType_normal-solo-fpp uint8 45 matchType_normal-squad uint8 46 matchType_normal-squad-fpp uint8 47 matchType_solo uint8 48 matchType_solo-fpp uint8 49 matchType_squad uint8 50 matchType_squad-fpp uint8 51 groupId_cat int32 52 matchId_cat int32
dtypes: bool(1), float64(12), int32(2), int64(21), object(1), uint8(16)
memory usage: 1.3+ GB
# Work on a random subset of up to 1,000,000 rows. The size is capped at the
# number of rows available so sampling never raises on a smaller frame, and
# the seed is fixed so the subset (and the scores derived from it) can be
# reproduced between runs.
sample_size = min(1000000, len(df))
df_sample = df.sample(n=sample_size, random_state=42)
df_sample.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 3835307 to 3933898
Data columns (total 53 columns):# Column Non-Null Count Dtype
--- ------ -------------- ----- 0 Id 1000000 non-null object 1 assists 1000000 non-null int64 2 boosts 1000000 non-null int64 3 damageDealt 1000000 non-null float644 DBNOs 1000000 non-null int64 5 headshotKills 1000000 non-null int64 6 heals 1000000 non-null int64 7 killPlace 1000000 non-null int64 8 killPoints 1000000 non-null int64 9 kills 1000000 non-null int64 10 killStreaks 1000000 non-null int64 11 longestKill 1000000 non-null float6412 matchDuration 1000000 non-null int64 13 maxPlace 1000000 non-null int64 14 numGroups 1000000 non-null int64 15 rankPoints 1000000 non-null int64 16 revives 1000000 non-null int64 17 rideDistance 1000000 non-null float6418 roadKills 1000000 non-null int64 19 swimDistance 1000000 non-null float6420 teamKills 1000000 non-null int64 21 vehicleDestroys 1000000 non-null int64 22 walkDistance 1000000 non-null float6423 weaponsAcquired 1000000 non-null int64 24 winPoints 1000000 non-null int64 25 winPlacePerc 1000000 non-null float6426 playersJoined 1000000 non-null int64 27 killsNorm 1000000 non-null float6428 damageDealtNorm 1000000 non-null float6429 maxPlaceNorm 1000000 non-null float6430 matchDurationNorm 1000000 non-null float6431 healsandboosts 1000000 non-null int64 32 totalDistance 1000000 non-null float6433 killsWithoutMoving 1000000 non-null bool 34 headshot_rate 1000000 non-null float6435 matchType_crashfpp 1000000 non-null uint8 36 matchType_crashtpp 1000000 non-null uint8 37 matchType_duo 1000000 non-null uint8 38 matchType_duo-fpp 1000000 non-null uint8 39 matchType_flarefpp 1000000 non-null uint8 40 matchType_flaretpp 1000000 non-null uint8 41 matchType_normal-duo 1000000 non-null uint8 42 matchType_normal-duo-fpp 1000000 non-null uint8 43 matchType_normal-solo 1000000 non-null uint8 44 matchType_normal-solo-fpp 1000000 non-null uint8 45 matchType_normal-squad 1000000 non-null uint8 46 matchType_normal-squad-fpp 1000000 non-null uint8 47 matchType_solo 1000000 non-null uint8 48 matchType_solo-fpp 1000000 non-null uint8 
49 matchType_squad 1000000 non-null uint8 50 matchType_squad-fpp 1000000 non-null uint8 51 groupId_cat 1000000 non-null int32 52 matchId_cat 1000000 non-null int32
dtypes: bool(1), float64(12), int32(2), int64(21), object(1), uint8(16)
memory usage: 290.9+ MB
# Display (rows, columns) of the sampled frame.
df_sample.shape
(1000000, 53)
# Features: everything except the target and the Id column.
non_feature_columns = ["winPlacePerc", "Id"]
df_sample_x = df_sample.drop(columns=non_feature_columns)
# Target: the winPlacePerc column.
df_sample_y = df_sample['winPlacePerc']
df_sample_y.shape
(1000000,)
from sklearn.model_selection import train_test_split
# Hold out 20% of the sample for evaluation. The seed is fixed so the same
# rows land in the train and test sets on every run, which keeps the reported
# metrics reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df_sample_x, df_sample_y, test_size=0.2, random_state=42
)
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)
(800000, 51) (200000, 51) (800000,) (200000,)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# Fit a 40-tree random forest on the training split; n_jobs=-1 lets
# scikit-learn use all available processors.
forest_params = {
    'n_estimators': 40,
    'min_samples_leaf': 3,
    'max_features': 'sqrt',
    'n_jobs': -1,
}
m1 = RandomForestRegressor(**forest_params)
m1.fit(x_train, y_train)
RandomForestRegressor(max_features='sqrt', min_samples_leaf=3, n_estimators=40,n_jobs=-1)
# Evaluate m1 on the held-out split (for scikit-learn regressors, score() is R^2).
m1.score(x_test,y_test)
0.921637092838284
# Mean absolute error of m1's predictions on the held-out split.
y_pre = m1.predict(x_test)
mean_absolute_error(y_test, y_pre)
0.061032362616554967
# Pair each feature name with m1's importance score, most important first,
# and show the top 20.
importance_table = pd.DataFrame(
    {"cols": df_sample_x.columns, "imp": m1.feature_importances_}
)
imp_df_sample_x = importance_table.sort_values("imp", ascending=False)
imp_df_sample_x.head(20)
| cols | imp |
---|
21 | walkDistance | 0.259065 |
---|
6 | killPlace | 0.230299 |
---|
30 | totalDistance | 0.168414 |
---|
1 | boosts | 0.058193 |
---|
29 | healsandboosts | 0.049961 |
---|
22 | weaponsAcquired | 0.045347 |
---|
5 | heals | 0.021690 |
---|
16 | rideDistance | 0.021267 |
---|
26 | damageDealtNorm | 0.019354 |
---|
28 | matchDurationNorm | 0.011771 |
---|
10 | longestKill | 0.011535 |
---|
9 | killStreaks | 0.011351 |
---|
11 | matchDuration | 0.010507 |
---|
8 | kills | 0.008333 |
---|
25 | killsNorm | 0.007792 |
---|
13 | numGroups | 0.006870 |
---|
27 | maxPlaceNorm | 0.006784 |
---|
24 | playersJoined | 0.006689 |
---|
12 | maxPlace | 0.006370 |
---|
0 | assists | 0.005943 |
---|
# Keep only the features whose importance exceeds 0.005 and build the reduced
# feature frame from them.
is_important = imp_df_sample_x['imp'] > 0.005
keep = imp_df_sample_x.loc[is_important, 'cols']
df_2 = df_sample_x[keep]
df_2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 3835307 to 3933898
Data columns (total 21 columns):# Column Non-Null Count Dtype
--- ------ -------------- ----- 0 walkDistance 1000000 non-null float641 killPlace 1000000 non-null int64 2 totalDistance 1000000 non-null float643 boosts 1000000 non-null int64 4 healsandboosts 1000000 non-null int64 5 weaponsAcquired 1000000 non-null int64 6 heals 1000000 non-null int64 7 rideDistance 1000000 non-null float648 damageDealtNorm 1000000 non-null float649 matchDurationNorm 1000000 non-null float6410 longestKill 1000000 non-null float6411 killStreaks 1000000 non-null int64 12 matchDuration 1000000 non-null int64 13 kills 1000000 non-null int64 14 killsNorm 1000000 non-null float6415 numGroups 1000000 non-null int64 16 maxPlaceNorm 1000000 non-null float6417 playersJoined 1000000 non-null int64 18 maxPlace 1000000 non-null int64 19 assists 1000000 non-null int64 20 damageDealt 1000000 non-null float64
dtypes: float64(9), int64(12)
memory usage: 167.8 MB
# Reuse the original train/test partition, restricted to the selected columns,
# instead of drawing a fresh random split. A new split would (a) score m2 on a
# different test set than m1, making the two results non-comparable, and
# (b) let rows that m1 trained on -- and that therefore influenced which
# features were kept -- end up in m2's test set.
xx_train, xx_test = x_train[keep], x_test[keep]
yy_train, yy_test = y_train, y_test
# Same hyper-parameters as the first model so that only the feature set differs.
m2=RandomForestRegressor(n_estimators=40,min_samples_leaf=3, max_features='sqrt',n_jobs=-1)
m2.fit(xx_train,yy_train)
m2.score(xx_test,yy_test)
0.9249575750059011
# Mean absolute error of m2's predictions on its test split.
yy_pre = m2.predict(xx_test)
mean_absolute_error(yy_test, yy_pre)
0.05952061649490702