티스토리 뷰

카테고리 없음

XGBOOST

최영민85 2018. 8. 10. 22:49

Boosting

ㅇ Boosting: fits the data by combining multiple simpler models, the so-called base learners (weak learners).

ㅇ Bagging(Random Forest)과 Boosting의 차이

- 공통점: Get N learners from 1 learner

- 차이점: Bagging은 독립적인 N개의 모델, boosting은 앞 모델의 보완을 위한 새로운 모델을 더함

https://quantdare.com/what-is-the-difference-between-bagging-and-boosting/

- Boosting은 ensemble model의 일종임

- Adaptive basis function model로, model을 sequentially하게 더해 기존 모델의 성능을 향상시킴

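- m번째 모델 $f_m(x)$는 m-1까지의 모델 $f_{m-1}(x)$과 $\theta_m \phi_m(x)$의 합으로 이루어지며($f_m(x) = f_{m-1}(x) + \theta_m \phi_m(x)$), $(\theta_m, \phi_m)$는 Loss function을 최소화하는 값으로 추정함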

XGBOOST

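- 최종 Goal은 Loss function(l)을 최소화 시키는 f를 찾는 것이며, 과적합을 막기 위해 Regularization Term($\Omega$)을 추가: $Obj = \sum_{i=1}^{n} l(y_i, \hat{y}_i) + \sum_{k} \Omega(f_k)$, $\Omega(f) = \gamma T + \frac{1}{2}\lambda \lVert w \rVert^2$ (T: leaf 개수, w: leaf weight)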

- 위의 Obj함수를 최소화하기 위해 Adaptive basis function model처럼 tree model을 sequentially하게 더해 기존 모델의 성능을 향상시킴

- Constant Prediction으로 시작하여, 이전 round 모델에 new function(f)를 더해 나가게 되며 각 단계에서 obj를 최소화 하는 f를 찾게 됨

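- XGBOOST의 핵심은 각 단계의 Obj를 2차 Taylor 전개로 근사하여, f를 이전 단계까지의 모델 $\hat{y}^{(t-1)}$의 함수인 g와 h의 조합으로 구할 수 있게 한다는 점임: $Obj^{(t)} \simeq \sum_{i=1}^{n} \left[ g_i f_t(x_i) + \frac{1}{2} h_i f_t(x_i)^2 \right] + \Omega(f_t)$, 여기서 $g_i = \partial_{\hat{y}^{(t-1)}} l(y_i, \hat{y}^{(t-1)})$, $h_i = \partial^2_{\hat{y}^{(t-1)}} l(y_i, \hat{y}^{(t-1)})$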

- 자세한 구현과정은 다음과 같음

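1. Initialize $\hat{f}^{(0)}(x) = \hat{\theta}_0 = \arg\min_{\theta} \sum_{i=1}^{n} l(y_i, \theta)$

=> logistic loss의 경우 $\hat{\theta}_0 = \log\left(\frac{\bar{y}}{1-\bar{y}}\right)$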

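2. For m=1 to M :

3. $g_m(x_i) = \left[ \frac{\partial l(y_i, f(x_i))}{\partial f(x_i)} \right]_{f = \hat{f}^{(m-1)}}$

4. $h_m(x_i) = \left[ \frac{\partial^2 l(y_i, f(x_i))}{\partial f(x_i)^2} \right]_{f = \hat{f}^{(m-1)}}$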

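5. Determine the structure by selecting splits which maximize $Gain = \frac{1}{2}\left[ \frac{G_L^2}{H_L + \lambda} + \frac{G_R^2}{H_R + \lambda} - \frac{G^2}{H + \lambda} \right] - \gamma$ (G, H: 노드에 속한 관측치의 g, h의 합)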

Gain은 Tree가 Split 되기 전후의 Obj(loss)의 차이다.

Gain이 특정 값 이상일 경우 Split을 진행함

Split 대상 노드가 아닌 경우(T2, T3, T4)는 Split 전후의 loss가 동일하므로, Split 대상 노드(T1)와 Split 된 후의

노드(T1L, T1R)의 loss만 비교하면 된다.

이렇게 구한 gain은 variable importance를 계산하는 값이 된다.

6. Determine the leaf weights for the learnt structure by $\hat{w}_{j} = -\frac{G_{j}}{H_{j} + \lambda}$

7. $\hat{f}_m(x) = \eta \sum_{j=1}^{T} \hat{w}_{j} \, I(x \in R_{j})$

- 6번에서 각 노드마다 $\hat{w}_j$를 계산하고 f(m-1)과 더하여 f(m)을 계산

8. $\hat{f}^{(m)}(x) = \hat{f}^{(m-1)}(x) + \hat{f}_m(x)$

9. end

output : $\hat{f}(x) = \hat{f}^{(M)}(x) = \sum_{m=0}^{M} \hat{f}_m(x)$

Reference

- https://medium.com/syncedreview/tree-boosting-with-xgboost-why-does-xgboost-win-every-machine-learning-competition-ca8034c0b283

- https://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf

- http://www.saedsayad.com/docs/xgboost.pdf

titanic3.csv

# Download the Titanic passenger data and take a first look at it:
# the first rows, then the column types.
path <- "https://raw.githubusercontent.com/thomaspernet/data_csv_r/master/data/titanic_csv.csv"
titanic <- read.csv(file = path)
head(titanic)

str(titanic)

#install.packages("dplyr")
library(dplyr)
# Build the modelling table in one pipeline:
#   1. drop the identifier / free-text columns,
#   2. recode pclass and survived as labelled factors,
#   3. remove every row that still contains a missing value.
clean_titanic <- titanic %>%
  select(-home.dest, -cabin, -name, -X, -ticket) %>%
  mutate(
    pclass = factor(
      pclass,
      levels = c(1, 2, 3),
      labels = c("Upper", "Middle", "Lower")
    ),
    survived = factor(
      survived,
      levels = c(0, 1),
      labels = c("No", "Yes")
    )
  ) %>%
  na.omit()
glimpse(clean_titanic)

# Turn every predictor (and the label) into a 0/1 indicator so that each
# feature offers exactly one possible split: value 0 versus value 1.
clean_titanic$pclass <- as.numeric(clean_titanic$pclass == "Lower")
clean_titanic$survived <- as.numeric(clean_titanic$survived == "Yes")
clean_titanic$sex <- as.numeric(clean_titanic$sex == "female")
# Age is bucketed into three mutually exclusive bands.
clean_titanic$age_20 <- as.numeric(clean_titanic$age <= 20)
clean_titanic$age_40 <- as.numeric(
  clean_titanic$age > 20 & clean_titanic$age <= 40
)
clean_titanic$age_o40 <- as.numeric(clean_titanic$age > 40)

############################################ Data cleaning
# `data` holds the label plus the indicator features; `x` holds the
# features only and `y` the label on its own.
data <- select(clean_titanic, survived, pclass, sex, age_20, age_40, age_o40)
y <- data[["survived"]]

x <- select(clean_titanic, pclass, sex, age_20, age_40, age_o40)
summary(x)

########################################## Initial settings
M <- 10         # number of boosting iterations
eta <- 0.7      # learning rate
gam <- 0        # regularization: penalty subtracted from each split gain
lambda <- 0     # L2 regularization added to the hessian sums
max_depth <- 2  # depth of each tree
############################################ Function definitions

# Binary cross-entropy (log loss) of predicted probabilities.
#
# y    : observed labels, coded 0/1.
# yhat : predicted probabilities, strictly between 0 and 1.
# Returns the per-observation loss, same length as the inputs.
#
# The multiplication operators were missing in the original, which made the
# expression unparseable. The leading minus turns the log-likelihood into a
# loss, so that grafun() (pred - label) is its derivative with respect to
# the raw score.
logloss <- function(y, yhat) {
  -(y * log(yhat) + (1 - y) * log(1 - yhat))
}

# First-order term used by the boosting loop: prediction minus label.
#
# pred  : predicted probabilities.
# label : observed 0/1 labels.
grafun <- function(pred, label) {
  residual <- pred - label
  residual
}

# Second-order term used by the boosting loop: p * (1 - p).
#
# pred : predicted probabilities.
# Returns a numeric vector of the same length as `pred`.
#
# The original `pred(1-pred)` tried to call `pred` as a function; the
# multiplication operator was missing.
hessfun <- function(pred) {
  pred * (1 - pred)
}

# Logistic (sigmoid) transform: maps a raw score to a probability.
#
# y : raw score(s) on the log-odds scale.
predfun <- function(y) {
  denom <- 1 + exp(-y)
  1 / denom
}

######################################## Initialize f0
# Start from a constant model: the log-odds of the overall survival rate.
n <- nrow(data)
ybar <- sum(data[["survived"]]) / n
f1 <- log(ybar / (1 - ybar))

# One column per boosting round is appended to each of these tables:
#   ffun - raw scores, gfun - gradients, hfun - hessians.
ffun <- data.frame(f1 = rep(f1, times = n))
gfun <- data.frame(g1 = grafun(predfun(f1), data[["survived"]]))
hfun <- data.frame(h1 = rep(hessfun(predfun(f1)), times = n))

############################################ Boosting rounds
# Newton tree boosting on 0/1 features.
#
# Each round z grows one tree of depth `max_depth` on the gradients and
# hessians stored in column z of gfun / hfun, then appends the updated
# scores, gradients and hessians as column z + 1 (named V<z + 1>).
#
# Fixes relative to the original loop:
#   * `1/2(...)` lacked the `*` and failed with "attempt to apply
#     non-function".
#   * Ties in the best gain made `which(maxg == gainlist)` return several
#     features; which.max() always picks exactly one.
#   * The learning rate `eta` was declared but never applied, and `lambda`
#     was missing from the leaf-weight denominator.
#   * The leaf update ran once per depth level although only the result of
#     the deepest level survived; it now runs once per tree.
#   * `1:length()` / `1:max_depth` replaced by seq_along() / seq_len(),
#     scalar conditions use `&&`.
coln <- colnames(x)  # candidate split features

for (z in seq_len(M)) {
  # Loss derivatives evaluated at the model of the previous round.
  grad <- gfun[[z]]
  hess <- hfun[[z]]

  # Column q of TreeIndex holds each row's node id at depth q - 1 (root = 1).
  TreeIndex <- data.frame(T1 = rep(1, n))

  for (q in seq_len(max_depth)) {
    parent <- TreeIndex[[q]]
    child <- numeric(n)
    trnum <- 0  # running id of the nodes created at this depth

    for (j in seq_len(max(parent))) {
      in_node <- parent == j
      og <- sum(grad[in_node])
      oh <- sum(hess[in_node])

      # Gain of splitting node j on each feature; NA when a side is empty.
      gainlist <- rep(NA_real_, length(coln))
      for (k in seq_along(coln)) {
        goes_left <- in_node & data[[coln[k]]] == 0
        goes_right <- in_node & data[[coln[k]]] == 1
        if (any(goes_left) && any(goes_right)) {
          subdlg <- sum(grad[goes_left])
          subdrg <- sum(grad[goes_right])
          subdlh <- sum(hess[goes_left])
          subdrh <- sum(hess[goes_right])
          gainlist[k] <- 1 / 2 * (
            subdlg^2 / (subdlh + lambda) +
              subdrg^2 / (subdrh + lambda) -
              og^2 / (oh + lambda)
          ) - gam
        }
      }

      # which.max() skips NA and returns integer(0) if no split is possible.
      splitindex <- which.max(gainlist)
      if (length(splitindex) == 1 && gainlist[splitindex] > 0) {
        feature <- data[[coln[splitindex]]]
        child[in_node & feature == 0] <- trnum + 1
        child[in_node & feature == 1] <- trnum + 2
        trnum <- trnum + 2
      } else {
        # No profitable split: the node is carried to the next depth as is.
        trnum <- trnum + 1
        child[in_node] <- trnum
      }
    }

    TreeIndex[[paste0("V", q + 1)]] <- child
  }

  # Leaf weights w = -G / (H + lambda), shrunk by the learning rate.
  leaf <- TreeIndex[[ncol(TreeIndex)]]
  leaf_g <- ave(grad, leaf, FUN = sum)
  leaf_h <- ave(hess, leaf, FUN = sum)
  f_new <- ffun[[z]] + eta * (-leaf_g / (leaf_h + lambda))
  p_new <- predfun(f_new)

  new_col <- paste0("V", z + 1)
  ffun[[new_col]] <- f_new
  gfun[[new_col]] <- grafun(p_new, data$survived)
  hfun[[new_col]] <- hessfun(p_new)
}

# Predicted survival probability from the last boosting round. Taking the
# last column of ffun instead of the hard-coded `V11` keeps this correct
# when M is changed.
data$pred <- predfun(ffun[[ncol(ffun)]])

hist(data$pred)

# The two statements below were fused onto the hist() line in the original,
# which is a syntax error. They start a second variant of the script that
# reads a local copy of the data.
titanic = read.csv("/Users/user/Desktop/titanic3.csv")

str(titanic)

#install.packages("dplyr")

library(dplyr)

# Keep only the columns the model needs and drop incomplete rows.
# The first column is copied into `pclass` before selecting.
# NOTE(review): presumably the first header was not read as "pclass" from
# the local file - confirm.
titanic[["pclass"]] <- titanic[[1]]

clean_titanic <- titanic %>%
  select(pclass, survived, sex, age) %>%
  na.omit()
dim(clean_titanic)

# Recode every column as a 0/1 indicator. The first column of
# clean_titanic is pclass, flagged here when it equals 1.
clean_titanic$pclass <- as.numeric(clean_titanic[[1]] == 1)

clean_titanic$survived <- as.numeric(clean_titanic$survived == 1)

clean_titanic$sex <- as.numeric(clean_titanic$sex == "female")

# Age is bucketed into three mutually exclusive bands.
clean_titanic$age_20 <- as.numeric(clean_titanic$age <= 20)

clean_titanic$age_40 <- as.numeric(
  clean_titanic$age > 20 & clean_titanic$age <= 40
)

clean_titanic$age_o40 <- as.numeric(clean_titanic$age > 40)

############################################ Data cleaning

# `data` holds the label plus the indicator features; `x` holds the
# features only and `y` the label on its own.
data <- select(clean_titanic, survived, pclass, sex, age_20, age_40, age_o40)

y <- data[["survived"]]

x <- select(clean_titanic, pclass, sex, age_20, age_40, age_o40)

summary(x)

########################################## Initial settings

M <- 5          # number of boosting iterations

eta <- 0.7      # learning rate

gam <- 0        # regularization: penalty subtracted from each split gain

lambda <- 0     # L2 regularization added to the hessian sums

max_depth <- 2  # depth of each tree

############################################ Function definitions

# Binary cross-entropy (log loss) of predicted probabilities.
#
# y    : observed labels, coded 0/1.
# yhat : predicted probabilities, strictly between 0 and 1.
# Returns the per-observation loss, same length as the inputs.
#
# The multiplication operators were missing in the original, which made the
# expression unparseable. The leading minus turns the log-likelihood into a
# loss, so that grafun() (pred - label) is its derivative with respect to
# the raw score.
logloss <- function(y, yhat) {
  -(y * log(yhat) + (1 - y) * log(1 - yhat))
}

# First-order term used by the boosting loop: prediction minus label.
#
# pred  : predicted probabilities.
# label : observed 0/1 labels.
grafun <- function(pred, label) {
  residual <- pred - label
  residual
}

# Second-order term used by the boosting loop: p * (1 - p).
#
# pred : predicted probabilities.
# Returns a numeric vector of the same length as `pred`.
#
# The original `pred(1-pred)` tried to call `pred` as a function; the
# multiplication operator was missing.
hessfun <- function(pred) {
  pred * (1 - pred)
}

# Logistic (sigmoid) transform: maps a raw score to a probability.
#
# y : raw score(s) on the log-odds scale.
predfun <- function(y) {
  denom <- 1 + exp(-y)
  1 / denom
}

######################################## Initialize f0

# Start from a constant model: the log-odds of the overall survival rate.
n <- nrow(data)

ybar <- sum(data[["survived"]]) / n

f1 <- log(ybar / (1 - ybar))

# One column per boosting round is appended to each of these tables:
#   ffun - raw scores, gfun - gradients, hfun - hessians.
ffun <- data.frame(f1 = rep(f1, times = n))

gfun <- data.frame(g1 = grafun(predfun(f1), data[["survived"]]))

hfun <- data.frame(h1 = rep(hessfun(predfun(f1)), times = n))

############################################ Boosting rounds
# Newton tree boosting on 0/1 features.
#
# Each round z grows one tree of depth `max_depth` on the gradients and
# hessians stored in column z of gfun / hfun, then appends the updated
# scores, gradients and hessians as column z + 1 (named V<z + 1>).
#
# Fixes relative to the original loop:
#   * `1/2(...)` lacked the `*` and failed with "attempt to apply
#     non-function".
#   * Ties in the best gain made `which(maxg == gainlist)` return several
#     features; which.max() always picks exactly one.
#   * The learning rate `eta` was declared but never applied, and `lambda`
#     was missing from the leaf-weight denominator.
#   * The leaf update ran once per depth level although only the result of
#     the deepest level survived; it now runs once per tree.
#   * `1:length()` / `1:max_depth` replaced by seq_along() / seq_len(),
#     scalar conditions use `&&`.
coln <- colnames(x)  # candidate split features

for (z in seq_len(M)) {
  # Loss derivatives evaluated at the model of the previous round.
  grad <- gfun[[z]]
  hess <- hfun[[z]]

  # Column q of TreeIndex holds each row's node id at depth q - 1 (root = 1).
  TreeIndex <- data.frame(T1 = rep(1, n))

  for (q in seq_len(max_depth)) {
    parent <- TreeIndex[[q]]
    child <- numeric(n)
    trnum <- 0  # running id of the nodes created at this depth

    for (j in seq_len(max(parent))) {
      in_node <- parent == j
      og <- sum(grad[in_node])
      oh <- sum(hess[in_node])

      # Gain of splitting node j on each feature; NA when a side is empty.
      gainlist <- rep(NA_real_, length(coln))
      for (k in seq_along(coln)) {
        goes_left <- in_node & data[[coln[k]]] == 0
        goes_right <- in_node & data[[coln[k]]] == 1
        if (any(goes_left) && any(goes_right)) {
          subdlg <- sum(grad[goes_left])
          subdrg <- sum(grad[goes_right])
          subdlh <- sum(hess[goes_left])
          subdrh <- sum(hess[goes_right])
          gainlist[k] <- 1 / 2 * (
            subdlg^2 / (subdlh + lambda) +
              subdrg^2 / (subdrh + lambda) -
              og^2 / (oh + lambda)
          ) - gam
        }
      }

      # which.max() skips NA and returns integer(0) if no split is possible.
      splitindex <- which.max(gainlist)
      if (length(splitindex) == 1 && gainlist[splitindex] > 0) {
        feature <- data[[coln[splitindex]]]
        child[in_node & feature == 0] <- trnum + 1
        child[in_node & feature == 1] <- trnum + 2
        trnum <- trnum + 2
      } else {
        # No profitable split: the node is carried to the next depth as is.
        trnum <- trnum + 1
        child[in_node] <- trnum
      }
    }

    TreeIndex[[paste0("V", q + 1)]] <- child
  }

  # Leaf weights w = -G / (H + lambda), shrunk by the learning rate.
  leaf <- TreeIndex[[ncol(TreeIndex)]]
  leaf_g <- ave(grad, leaf, FUN = sum)
  leaf_h <- ave(hess, leaf, FUN = sum)
  f_new <- ffun[[z]] + eta * (-leaf_g / (leaf_h + lambda))
  p_new <- predfun(f_new)

  new_col <- paste0("V", z + 1)
  ffun[[new_col]] <- f_new
  gfun[[new_col]] <- grafun(p_new, data$survived)
  hfun[[new_col]] <- hessfun(p_new)
}

# Predicted survival probability from the last boosting round. Taking the
# last column of ffun instead of the hard-coded `V6` keeps this correct
# when M is changed.
data$pred <- predfun(ffun[[ncol(ffun)]])

공지사항

최근에 올라온 글

최근에 달린 댓글

Total

Today

Yesterday

링크

TAG more

« 2025/05 »
일	월	화	수	목	금	토
				1	2	3
4	5	6	7	8	9	10
11	12	13	14	15	16	17
18	19	20	21	22	23	24
25	26	27	28	29	30	31

글 보관함

잡다한 데이터/머신러닝 이야기

티스토리 뷰

XGBOOST

XGBOOST

1. Initialize

=>

2. For m=1 to M :

3. 4

5. Determine the structure by selecting splits which maximize

6. Determine the leaf weights for the learnt structure by

7.

8.

9. end

티스토리툴바