티스토리 뷰

카테고리 없음

XGBOOST

최영민85 2018. 8. 10. 22:49

- 위의 Obj 함수를 최소화하기 위해, Adaptive basis function model 형태로 tree model을 순차적으로(sequentially) 더해 가며 기존 모델의 성능을 향상시킴
- Constant prediction으로 시작하여 이전 round의 모델에 new function(f)를 더해 나가며, 각 단계에서 Obj를 최소화하는 f를 찾게 됨

- XGBOOST의 핵심은 각 단계의 Obj를 Taylor 전개로 근사하여, 새로 더할 f에 대한 Obj를 이전 단계까지의 모델 $\hat{y}^{(t-1)}$의 함수인 g(1차 미분)와 h(2차 미분)의 조합으로 표현할 수 있게 된다는 점임

- 자세한 구현과정은 다음과 같음 

Gain은 Tree가 Split 되기 전과 후의 Obj(loss)의 차이다.
Gain이 특정 값 이상일 경우 Split을 진행한다.
Split 대상 노드가 아닌 경우(T2, T3, T4)는 Split 전후의 loss가 동일하므로, Split 대상 노드(T1)와 Split 된 후의
노드(T1L, T1R)의 loss만 비교하면 된다.

이렇게 구한 gain은 variable importance를 계산하는 데 사용되는 값이 된다.



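6. Determine the leaf weights $\hat{w}_{jm}$ for the learnt structure by

$$\hat{w}_{jm} = -\frac{G_{jm}}{H_{jm} + \lambda}, \qquad G_{jm} = \sum_{i \in R_{jm}} g_m(x_i), \quad H_{jm} = \sum_{i \in R_{jm}} h_m(x_i)$$


7. $\hat{f}_m(x) = \eta \sum_{j=1}^{T} \hat{w}_{jm}\, I(x \in R_{jm})$

- 6번에서 각 노드마다 $\hat{w}_{jm}$를 계산하고, 여기에 learning rate $\eta$를 곱한 $\hat{f}_m$을 $\hat{f}^{(m-1)}$과 더하여 $\hat{f}^{(m)}$을 계산



8. $\hat{f}^{(m)}(x) = \hat{f}^{(m-1)}(x) + \hat{f}_m(x)$

9. end

output : $\hat{f}(x) \equiv \hat{f}^{(M)}(x) = \sum_{m=0}^{M} \hat{f}_m(x)$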

Reference 
- https://medium.com/syncedreview/tree-boosting-with-xgboost-why-does-xgboost-win-every-machine-learning-competition-ca8034c0b283
- https://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf
- http://www.saedsayad.com/docs/xgboost.pdf


# Download the Titanic passenger table and take a first look at it.
path <- "https://raw.githubusercontent.com/thomaspernet/data_csv_r/master/data/titanic_csv.csv"
titanic <- read.csv(path)

head(titanic)
str(titanic)

# install.packages("dplyr")
library(dplyr)

# Drop the columns the model does not use, label pclass and survived as
# factors, then discard every row that still contains a missing value.
clean_titanic <- titanic %>%
  select(-home.dest, -cabin, -name, -X, -ticket) %>%
  mutate(
    pclass = factor(
      pclass,
      levels = c(1, 2, 3),
      labels = c("Upper", "Middle", "Lower")
    ),
    survived = factor(
      survived,
      levels = c(0, 1),
      labels = c("No", "Yes")
    )
  ) %>%
  na.omit()

glimpse(clean_titanic)



# Recode every predictor (and the label) as a 0/1 indicator, because the
# boosting loop further down only considers splits of the form
# "feature == 0" versus "feature == 1".
clean_titanic$pclass <- as.numeric(clean_titanic$pclass == "Lower")
clean_titanic$survived <- as.numeric(clean_titanic$survived == "Yes")
clean_titanic$sex <- as.numeric(clean_titanic$sex == "female")

# Three mutually exclusive age bands: <= 20, (20, 40], > 40.
clean_titanic$age_20 <- as.numeric(clean_titanic$age <= 20)
clean_titanic$age_40 <- as.numeric(
  clean_titanic$age > 20 & clean_titanic$age <= 40
)
clean_titanic$age_o40 <- as.numeric(clean_titanic$age > 40)


############################################ Data cleaning

# `data` keeps the label next to the features; `y` is the label alone and
# `x` the feature columns alone.
data <- select(clean_titanic, survived, pclass, sex, age_20, age_40, age_o40)

y <- data$survived

x <- select(clean_titanic, pclass, sex, age_20, age_40, age_o40)

summary(x)


########################################## Initial settings

M <- 10         # number of boosting iterations (trees)
eta <- 0.7      # learning rate
gam <- 0        # regularization: constant subtracted from every split gain
lambda <- 0     # L2 regularization added to the hessian sums
max_depth <- 2  # depth of each tree

############################################ Function definitions


# Per-observation log loss (negative Bernoulli log-likelihood).
#
# y    : observed labels, 0 or 1.
# yhat : predicted probabilities of y == 1.
#
# Returns a vector of non-negative losses, one per observation. The leading
# minus sign makes this the quantity whose derivative with respect to the
# raw score is `pred - label`, i.e. what grafun() returns; without it the
# function would yield the log-likelihood, which is never positive.
logloss <- function(y, yhat) {
  -(y * log(yhat) + (1 - y) * log(1 - yhat))
}


# First derivative of the log loss with respect to the raw score.
#
# pred  : predicted probabilities.
# label : observed 0/1 labels.
#
# Returns the element-wise difference pred - label.
grafun <- function(pred, label) {
  gradient <- pred - label
  gradient
}


# Second derivative of the log loss with respect to the raw score.
#
# pred : predicted probabilities.
#
# Returns pred * (1 - pred), element-wise.
hessfun <- function(pred) {
  complement <- 1 - pred
  pred * complement
}


# Logistic (sigmoid) transform: maps a raw score to a probability.
#
# y : raw scores (log-odds).
#
# Returns 1 / (1 + exp(-y)), element-wise.
predfun <- function(y) {
  denominator <- 1 + exp(-y)
  1 / denominator
}


######################################## Initialize f0

# Round-1 state: a constant model whose raw score is the log-odds of the
# overall survival rate.
n <- nrow(data)
ybar <- sum(data$survived) / n
f1 <- log(ybar / (1 - ybar))

# One column per boosting round; the loop below appends column z + 1 in
# round z.
#   ffun : raw scores
#   gfun : gradients of the loss at those scores
#   hfun : hessians of the loss at those scores
ffun <- data.frame(f1 = rep(f1, times = n))
gfun <- data.frame(g1 = grafun(predfun(f1), data$survived))
hfun <- data.frame(h1 = rep(hessfun(predfun(f1)), times = n))




############################################ 

# Newton (second-order) tree boosting on 0/1 features.
#
# Each round z:
#   1. grows one tree with `max_depth` levels of splits. For every node the
#      feature whose 0/1 split has the largest gain
#        0.5 * (GL^2/(HL+lambda) + GR^2/(HR+lambda) - G^2/(H+lambda)) - gam
#      is chosen, where G and H are sums of the round-z gradients and
#      hessians over the rows involved. A node is split only if that gain
#      is positive;
#   2. gives every leaf the Newton weight -G / (H + lambda);
#   3. stores f_z + eta * weight, and the gradient/hessian at the new score,
#      as column z + 1 (named "V<z+1>") of ffun, gfun and hfun.
#
# Reads the globals data, x, n, M, eta, gam, lambda, max_depth and the
# helpers grafun(), hessfun(), predfun().
#
# Compared with the earlier version of this loop:
#   - the step is scaled by `eta` and the leaf weight uses `lambda`
#     (both were declared but ignored in the update);
#   - tied gains no longer break the split (first best feature wins);
#   - leaf values are computed once per tree instead of once per depth.
coln <- colnames(x)

for (z in seq_len(M)) {
  grad <- gfun[[z]]
  hess <- hfun[[z]]

  # Column q of TreeIndex holds each row's node id at depth q; every row
  # starts in root node 1.
  TreeIndex <- data.frame(T1 = rep(1, n))

  for (q in seq_len(max_depth)) {
    parent_id <- TreeIndex[[q]]
    child_id <- numeric(n)
    trnum <- 0  # last node id handed out at depth q + 1

    for (j in sort(unique(parent_id))) {
      ord_index <- which(parent_id == j)
      og <- sum(grad[ord_index])
      oh <- sum(hess[ord_index])
      parent_score <- og^2 / (oh + lambda)

      # Gain of each candidate feature; NA marks a feature that does not
      # separate the rows of this node into two non-empty groups.
      gainlist <- rep(NA_real_, length(coln))
      for (k in seq_along(coln)) {
        feature <- data[[coln[k]]]
        subdl_index <- which(parent_id == j & feature == 0)
        subdr_index <- which(parent_id == j & feature == 1)

        if (length(subdl_index) > 0 && length(subdr_index) > 0) {
          left_score <- sum(grad[subdl_index])^2 /
            (sum(hess[subdl_index]) + lambda)
          right_score <- sum(grad[subdr_index])^2 /
            (sum(hess[subdr_index]) + lambda)
          gainlist[k] <- 0.5 * (left_score + right_score - parent_score) - gam
        }
      }

      # which.max() skips NA/NaN and returns the first maximum, so ties
      # resolve to a single feature and an all-NA list yields integer(0).
      splitindex <- which.max(gainlist)

      if (length(splitindex) == 1 && gainlist[splitindex] > 0) {
        feature <- data[[coln[splitindex]]]
        subdl_index <- which(parent_id == j & feature == 0)
        subdr_index <- which(parent_id == j & feature == 1)

        child_id[subdl_index] <- trnum + 1
        child_id[subdr_index] <- trnum + 2
        trnum <- trnum + 2
      } else {
        # No worthwhile split: the node is carried down unchanged.
        trnum <- trnum + 1
        child_id[ord_index] <- trnum
      }
    }

    TreeIndex[[q + 1]] <- child_id
  }

  # Newton step for every leaf of the finished tree.
  leaf_id <- TreeIndex[[max_depth + 1]]
  leaf_weight <- numeric(n)
  for (p in sort(unique(leaf_id))) {
    suindex <- which(leaf_id == p)
    leaf_weight[suindex] <- -sum(grad[suindex]) / (sum(hess[suindex]) + lambda)
  }

  # Append round z + 1 under the explicit name "V<z+1>".
  next_col <- paste0("V", z + 1)
  ffun[[next_col]] <- ffun[[z]] + eta * leaf_weight
  next_prob <- predfun(ffun[[next_col]])
  gfun[[next_col]] <- grafun(next_prob, data$survived)
  hfun[[next_col]] <- hessfun(next_prob)
}


# Predicted survival probability after the last boosting round. Round z
# writes column z + 1 of ffun, so the final scores sit in column M + 1;
# indexing by position keeps this correct when M is changed.
data$pred <- predfun(ffun[[M + 1]])

hist(data$pred)


# ---- Second listing: same procedure, reading a local copy of the data ----
# (This statement used to be fused onto the hist() line above, which does
# not parse.)
titanic <- read.csv("/Users/user/Desktop/titanic3.csv")



str(titanic)

# install.packages("dplyr")
library(dplyr)

# Expose the first column of the raw table under the name `pclass`.
# NOTE(review): presumably works around a first header that is not read in
# as "pclass" -- verify against the CSV.
titanic$pclass <- titanic[[1]]

# Keep only the columns the model uses and drop rows with missing values.
clean_titanic <- na.omit(select(titanic, pclass, survived, sex, age))

dim(clean_titanic)


# Recode every predictor (and the label) as a 0/1 indicator, because the
# boosting loop further down only considers splits of the form
# "feature == 0" versus "feature == 1".
# pclass is the first column of clean_titanic; here 1 marks class code 1.
clean_titanic$pclass <- as.numeric(clean_titanic$pclass == 1)
clean_titanic$survived <- as.numeric(clean_titanic$survived == 1)
clean_titanic$sex <- as.numeric(clean_titanic$sex == "female")

# Three mutually exclusive age bands: <= 20, (20, 40], > 40.
clean_titanic$age_20 <- as.numeric(clean_titanic$age <= 20)
clean_titanic$age_40 <- as.numeric(
  clean_titanic$age > 20 & clean_titanic$age <= 40
)
clean_titanic$age_o40 <- as.numeric(clean_titanic$age > 40)


############################################ Data cleaning

# `data` keeps the label next to the features; `y` is the label alone and
# `x` the feature columns alone.
data <- select(clean_titanic, survived, pclass, sex, age_20, age_40, age_o40)

y <- data$survived

x <- select(clean_titanic, pclass, sex, age_20, age_40, age_o40)

summary(x)




########################################## Initial settings

M <- 5          # number of boosting iterations (trees)
eta <- 0.7      # learning rate
gam <- 0        # regularization: constant subtracted from every split gain
lambda <- 0     # L2 regularization added to the hessian sums
max_depth <- 2  # depth of each tree


############################################ Function definitions


# Per-observation log loss (negative Bernoulli log-likelihood).
#
# y    : observed labels, 0 or 1.
# yhat : predicted probabilities of y == 1.
#
# Returns a vector of non-negative losses, one per observation. The leading
# minus sign makes this the quantity whose derivative with respect to the
# raw score is `pred - label`, i.e. what grafun() returns; without it the
# function would yield the log-likelihood, which is never positive.
logloss <- function(y, yhat) {
  -(y * log(yhat) + (1 - y) * log(1 - yhat))
}




# First derivative of the log loss with respect to the raw score.
#
# pred  : predicted probabilities.
# label : observed 0/1 labels.
#
# Returns the element-wise difference pred - label.
grafun <- function(pred, label) {
  gradient <- pred - label
  gradient
}




# Second derivative of the log loss with respect to the raw score.
#
# pred : predicted probabilities.
#
# Returns pred * (1 - pred), element-wise.
hessfun <- function(pred) {
  complement <- 1 - pred
  pred * complement
}




# Logistic (sigmoid) transform: maps a raw score to a probability.
#
# y : raw scores (log-odds).
#
# Returns 1 / (1 + exp(-y)), element-wise.
predfun <- function(y) {
  denominator <- 1 + exp(-y)
  1 / denominator
}



######################################## Initialize f0


# Round-1 state: a constant model whose raw score is the log-odds of the
# overall survival rate.
n <- nrow(data)
ybar <- sum(data$survived) / n
f1 <- log(ybar / (1 - ybar))

# One column per boosting round; the loop below appends column z + 1 in
# round z.
#   ffun : raw scores
#   gfun : gradients of the loss at those scores
#   hfun : hessians of the loss at those scores
ffun <- data.frame(f1 = rep(f1, times = n))
gfun <- data.frame(g1 = grafun(predfun(f1), data$survived))
hfun <- data.frame(h1 = rep(hessfun(predfun(f1)), times = n))




############################################ 


# Newton (second-order) tree boosting on 0/1 features.
#
# Each round z:
#   1. grows one tree with `max_depth` levels of splits. For every node the
#      feature whose 0/1 split has the largest gain
#        0.5 * (GL^2/(HL+lambda) + GR^2/(HR+lambda) - G^2/(H+lambda)) - gam
#      is chosen, where G and H are sums of the round-z gradients and
#      hessians over the rows involved. A node is split only if that gain
#      is positive;
#   2. gives every leaf the Newton weight -G / (H + lambda);
#   3. stores f_z + eta * weight, and the gradient/hessian at the new score,
#      as column z + 1 (named "V<z+1>") of ffun, gfun and hfun.
#
# Reads the globals data, x, n, M, eta, gam, lambda, max_depth and the
# helpers grafun(), hessfun(), predfun().
#
# Compared with the earlier version of this loop:
#   - the step is scaled by `eta` and the leaf weight uses `lambda`
#     (both were declared but ignored in the update);
#   - tied gains no longer break the split (first best feature wins);
#   - leaf values are computed once per tree instead of once per depth.
coln <- colnames(x)

for (z in seq_len(M)) {
  grad <- gfun[[z]]
  hess <- hfun[[z]]

  # Column q of TreeIndex holds each row's node id at depth q; every row
  # starts in root node 1.
  TreeIndex <- data.frame(T1 = rep(1, n))

  for (q in seq_len(max_depth)) {
    parent_id <- TreeIndex[[q]]
    child_id <- numeric(n)
    trnum <- 0  # last node id handed out at depth q + 1

    for (j in sort(unique(parent_id))) {
      ord_index <- which(parent_id == j)
      og <- sum(grad[ord_index])
      oh <- sum(hess[ord_index])
      parent_score <- og^2 / (oh + lambda)

      # Gain of each candidate feature; NA marks a feature that does not
      # separate the rows of this node into two non-empty groups.
      gainlist <- rep(NA_real_, length(coln))
      for (k in seq_along(coln)) {
        feature <- data[[coln[k]]]
        subdl_index <- which(parent_id == j & feature == 0)
        subdr_index <- which(parent_id == j & feature == 1)

        if (length(subdl_index) > 0 && length(subdr_index) > 0) {
          left_score <- sum(grad[subdl_index])^2 /
            (sum(hess[subdl_index]) + lambda)
          right_score <- sum(grad[subdr_index])^2 /
            (sum(hess[subdr_index]) + lambda)
          gainlist[k] <- 0.5 * (left_score + right_score - parent_score) - gam
        }
      }

      # which.max() skips NA/NaN and returns the first maximum, so ties
      # resolve to a single feature and an all-NA list yields integer(0).
      splitindex <- which.max(gainlist)

      if (length(splitindex) == 1 && gainlist[splitindex] > 0) {
        feature <- data[[coln[splitindex]]]
        subdl_index <- which(parent_id == j & feature == 0)
        subdr_index <- which(parent_id == j & feature == 1)

        child_id[subdl_index] <- trnum + 1
        child_id[subdr_index] <- trnum + 2
        trnum <- trnum + 2
      } else {
        # No worthwhile split: the node is carried down unchanged.
        trnum <- trnum + 1
        child_id[ord_index] <- trnum
      }
    }

    TreeIndex[[q + 1]] <- child_id
  }

  # Newton step for every leaf of the finished tree.
  leaf_id <- TreeIndex[[max_depth + 1]]
  leaf_weight <- numeric(n)
  for (p in sort(unique(leaf_id))) {
    suindex <- which(leaf_id == p)
    leaf_weight[suindex] <- -sum(grad[suindex]) / (sum(hess[suindex]) + lambda)
  }

  # Append round z + 1 under the explicit name "V<z+1>".
  next_col <- paste0("V", z + 1)
  ffun[[next_col]] <- ffun[[z]] + eta * leaf_weight
  next_prob <- predfun(ffun[[next_col]])
  gfun[[next_col]] <- grafun(next_prob, data$survived)
  hfun[[next_col]] <- hessfun(next_prob)
}




# Predicted survival probability after the last boosting round. Round z
# writes column z + 1 of ffun, so the final scores sit in column M + 1;
# indexing by position keeps this correct when M is changed.
data$pred <- predfun(ffun[[M + 1]])



공지사항
최근에 올라온 글
최근에 달린 댓글
Total
Today
Yesterday
링크
TAG
more
«   2025/05   »
1 2 3
4 5 6 7 8 9 10
11 12 13 14 15 16 17
18 19 20 21 22 23 24
25 26 27 28 29 30 31
글 보관함