티스토리 뷰
- XGBoost의 핵심은 각 단계의 목적함수(Obj)를 2차 Taylor 전개로 근사하여, 새로 추가할 트리 $f_t$에 대한 최적화를 이전 단계까지의 예측값 $\hat{y}^{(t-1)}$에서 계산한 1차 미분 $g_i$와 2차 미분 $h_i$의 조합만으로 표현할 수 있게 된다는 점임
- 자세한 구현과정은 다음과 같음
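6. Determine the leaf weights $\{w_{jm}\}_{j=1}^{T}$
for the learnt structure by $w_{jm} = -\dfrac{G_{jm}}{H_{jm}}$, where $G_{jm} = \sum_{i \in R_{jm}} g_m(x_i)$ and $H_{jm} = \sum_{i \in R_{jm}} h_m(x_i)$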
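7. $f_m(x) = \eta \sum_{j=1}^{T} w_{jm}\, I(x \in R_{jm})$
- 6번에서 각 노드(leaf)마다 $w_{jm}$를 계산하고, 이를 $f^{(m-1)}$과 더하여 $f^{(m)}$을 계산
8. $f^{(m)}(x) = f^{(m-1)}(x) + f_m(x)$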
9. end
# Download the Titanic passenger table and take a first look at it.
path <- 'https://raw.githubusercontent.com/thomaspernet/data_csv_r/master/data/titanic_csv.csv'
titanic <- read.csv(file = path)
head(titanic)
str(titanic)
#install.packages("dplyr")
library(dplyr)
# Remove the columns that are not used, label the class and outcome columns
# as factors, and discard every row that still contains a missing value.
clean_titanic <- titanic %>%
  select(-c(home.dest, cabin, name, X, ticket)) %>%
  mutate(
    pclass = factor(pclass, levels = c(1, 2, 3), labels = c('Upper', 'Middle', 'Lower')),
    survived = factor(survived, levels = c(0, 1), labels = c('No', 'Yes'))
  ) %>%
  na.omit()
glimpse(clean_titanic)

# Recode every predictor (and the outcome) as a 0/1 indicator.
clean_titanic$pclass <- as.numeric(clean_titanic$pclass == "Lower")
clean_titanic$survived <- as.numeric(clean_titanic$survived == "Yes")
clean_titanic$sex <- as.numeric(clean_titanic$sex == "female")
# Age is bucketed into three mutually exclusive bands: <=20, (20, 40], >40.
clean_titanic$age_20 <- as.numeric(clean_titanic$age <= 20)
clean_titanic$age_40 <- as.numeric(clean_titanic$age > 20 & clean_titanic$age <= 40)
clean_titanic$age_o40 <- as.numeric(clean_titanic$age > 40)
############################################ Model data
# `data` carries the label plus the indicator features; `x` carries the
# features only (its column names drive the split search); `y` is the label.
data <- clean_titanic %>%
  select(survived, pclass, sex, age_20, age_40, age_o40)
y <- data$survived
x <- clean_titanic %>%
  select(pclass, sex, age_20, age_40, age_o40)
summary(x)
########################################## Initial hyperparameter values
M = 10 # Number of boosting iterations (one tree per iteration)
eta = 0.7 # The learning rate
gam = 0 # Regularization penalty subtracted from every candidate split gain
lambda = 0 # L2 term added to the hessian sums in the split-gain denominators
max_depth = 2 # Depth of each tree
############################################ 함수 정의
# Binary cross-entropy (negative Bernoulli log-likelihood), element-wise.
#
# y    : 0/1 labels.
# yhat : predicted probabilities.
# eps  : probabilities are clipped to [eps, 1 - eps] so that yhat of exactly
#        0 or 1 gives a large finite loss instead of NaN (0 * -Inf).
#
# Returns a non-negative numeric vector. The sign matters: grafun() is the
# derivative of this quantity with respect to the raw score, so the loss must
# be the negative of the log-likelihood, not the log-likelihood itself.
logloss <- function(y, yhat, eps = 1e-15) {
  yhat <- pmin(pmax(yhat, eps), 1 - eps)
  -(y * log(yhat) + (1 - y) * log(1 - yhat))
}
# First derivative of the logistic loss with respect to the raw score:
# predicted probability minus the 0/1 label.
grafun <- function(pred, label) {
  residual <- pred - label
  residual
}
# Second derivative of the logistic loss with respect to the raw score:
# p * (1 - p) for predicted probability p.
hessfun <- function(pred) {
  complement <- 1 - pred
  pred * complement
}
# Logistic (sigmoid) link: maps a raw score to a probability in [0, 1].
predfun <- function(y) {
  denom <- 1 + exp(-y)
  1 / denom
}
######################################## Initialize f0
# Every row starts from the same raw score: the log-odds of the overall
# survival rate. Column 1 of ffun / gfun / hfun holds the score, gradient and
# hessian of this starting model; later rounds append further columns.
n <- nrow(data)
ybar <- sum(data$survived) / n
f1 <- log(ybar / (1 - ybar))
ffun <- data.frame(f1 = rep(f1, times = n))
gfun <- data.frame(g1 = grafun(predfun(f1), data$survived))
hfun <- data.frame(h1 = rep(hessfun(predfun(f1)), times = n))
############################################ Newton boosting rounds
# Round z reads the per-row gradients/hessians stored in column z of
# gfun/hfun, grows one tree of depth max_depth over the 0/1 features named in
# colnames(x), and writes the updated scores, gradients and hessians to
# column z + 1 of ffun/gfun/hfun.
coln <- colnames(x)
for (z in seq_len(M)) {
  grad <- gfun[[z]]
  hess <- hfun[[z]]
  # Column q of TreeIndex is each row's node id after q - 1 levels of splits
  # (every row starts in the root, node 1).
  TreeIndex <- data.frame(T1 = rep(1, n))
  for (q in seq_len(max_depth)) {
    node_id <- TreeIndex[[q]]
    trnum <- 0 # running id for the nodes created on the next level
    for (j in min(node_id):max(node_id)) {
      in_node <- node_id == j
      ord_index <- which(in_node)
      # Parent sums do not depend on the candidate feature, so compute once.
      og <- sum(grad[ord_index])
      oh <- sum(hess[ord_index])
      # Gain of splitting node j on each feature; stays NA for a feature
      # that would leave one side empty.
      gainlist <- rep(NA_real_, length(coln))
      for (k in seq_along(coln)) {
        subdl_index <- which(data[[coln[k]]] == 0 & in_node)
        subdr_index <- which(data[[coln[k]]] == 1 & in_node)
        if (length(subdl_index) > 0 && length(subdr_index) > 0) {
          subdlg <- sum(grad[subdl_index])
          subdrg <- sum(grad[subdr_index])
          subdlh <- sum(hess[subdl_index])
          subdrh <- sum(hess[subdr_index])
          gainlist[k] <- 1 / 2 * (subdlg^2 / (subdlh + lambda) +
            subdrg^2 / (subdrh + lambda) -
            og^2 / (oh + lambda)) - gam
        }
      }
      # which.max() yields exactly one index even when several features tie
      # for the best gain, and integer(0) when no feature can split the node.
      splitindex <- which.max(gainlist)
      if (length(splitindex) == 1 && gainlist[splitindex] > 0) {
        split_col <- data[[coln[splitindex]]]
        TreeIndex[which(split_col == 0 & in_node), q + 1] <- trnum + 1
        TreeIndex[which(split_col == 1 & in_node), q + 1] <- trnum + 2
        trnum <- trnum + 2
      } else {
        # No split improves the objective: carry the node down unchanged.
        trnum <- trnum + 1
        TreeIndex[ord_index, q + 1] <- trnum
      }
    }
  }
  # Only the deepest level defines the leaves, so the score update is done
  # once per round, after the tree structure is final.
  leaf_id <- TreeIndex[[max_depth + 1]]
  for (p in min(leaf_id):max(leaf_id)) {
    suindex <- leaf_id == p
    subdg <- sum(grad[suindex])
    subdh <- sum(hess[suindex])
    # Newton leaf weight -G / (H + lambda), shrunk by the learning rate eta.
    ffun[suindex, z + 1] <- ffun[suindex, z] - eta * subdg / (subdh + lambda)
    gfun[suindex, z + 1] <- grafun(predfun(ffun[suindex, z + 1]), data$survived[suindex])
    hfun[suindex, z + 1] <- hessfun(predfun(ffun[suindex, z + 1]))
  }
}
# Turn the final-round scores into survival probabilities. The last column of
# ffun belongs to the last boosting round, so take it by position rather than
# by a column name that is only right for one particular value of M.
data$pred <- predfun(ffun[[ncol(ffun)]])
hist(data$pred)

# Second run: read a local copy of the Titanic data.
titanic <- read.csv("/Users/user/Desktop/titanic3.csv")
str(titanic)
#install.packages("dplyr")
library(dplyr)
# Keep only the modelling columns and drop rows with missing values.
# NOTE(review): pclass is re-created from the first CSV column by position,
# presumably because that column's header is read under a different name;
# confirm against the file.
titanic$pclass <- titanic[, 1]
clean_titanic <- titanic %>%
  select(pclass, survived, sex, age) %>%
  na.omit()
dim(clean_titanic)

# Recode every column as a 0/1 indicator. pclass is the first column of
# clean_titanic, so it can be addressed by name.
clean_titanic$pclass <- as.numeric(clean_titanic$pclass == 1)
clean_titanic$survived <- as.numeric(clean_titanic$survived == 1)
clean_titanic$sex <- as.numeric(clean_titanic$sex == "female")
# Age is bucketed into three mutually exclusive bands: <=20, (20, 40], >40.
clean_titanic$age_20 <- as.numeric(clean_titanic$age <= 20)
clean_titanic$age_40 <- as.numeric(clean_titanic$age > 20 & clean_titanic$age <= 40)
clean_titanic$age_o40 <- as.numeric(clean_titanic$age > 40)
############################################ Model data
# `data` carries the label plus the indicator features; `x` carries the
# features only (its column names drive the split search); `y` is the label.
data <- clean_titanic %>%
  select(survived, pclass, sex, age_20, age_40, age_o40)
y <- data$survived
x <- clean_titanic %>%
  select(pclass, sex, age_20, age_40, age_o40)
summary(x)
########################################## Initial hyperparameter values
M = 5 # Number of boosting iterations (one tree per iteration)
eta = 0.7 # The learning rate
gam = 0 # Regularization penalty subtracted from every candidate split gain
lambda = 0 # L2 term added to the hessian sums in the split-gain denominators
max_depth = 2 # Depth of each tree
############################################ 함수 정의
# Binary cross-entropy (negative Bernoulli log-likelihood), element-wise.
#
# y    : 0/1 labels.
# yhat : predicted probabilities.
# eps  : probabilities are clipped to [eps, 1 - eps] so that yhat of exactly
#        0 or 1 gives a large finite loss instead of NaN (0 * -Inf).
#
# Returns a non-negative numeric vector. The sign matters: grafun() is the
# derivative of this quantity with respect to the raw score, so the loss must
# be the negative of the log-likelihood, not the log-likelihood itself.
logloss <- function(y, yhat, eps = 1e-15) {
  yhat <- pmin(pmax(yhat, eps), 1 - eps)
  -(y * log(yhat) + (1 - y) * log(1 - yhat))
}
# First derivative of the logistic loss with respect to the raw score:
# predicted probability minus the 0/1 label.
grafun <- function(pred, label) {
  residual <- pred - label
  residual
}
# Second derivative of the logistic loss with respect to the raw score:
# p * (1 - p) for predicted probability p.
hessfun <- function(pred) {
  complement <- 1 - pred
  pred * complement
}
# Logistic (sigmoid) link: maps a raw score to a probability in [0, 1].
predfun <- function(y) {
  denom <- 1 + exp(-y)
  1 / denom
}
######################################## Initialize f0
# Every row starts from the same raw score: the log-odds of the overall
# survival rate. Column 1 of ffun / gfun / hfun holds the score, gradient and
# hessian of this starting model; later rounds append further columns.
n <- nrow(data)
ybar <- sum(data$survived) / n
f1 <- log(ybar / (1 - ybar))
ffun <- data.frame(f1 = rep(f1, times = n))
gfun <- data.frame(g1 = grafun(predfun(f1), data$survived))
hfun <- data.frame(h1 = rep(hessfun(predfun(f1)), times = n))
############################################ Newton boosting rounds
# Round z reads the per-row gradients/hessians stored in column z of
# gfun/hfun, grows one tree of depth max_depth over the 0/1 features named in
# colnames(x), and writes the updated scores, gradients and hessians to
# column z + 1 of ffun/gfun/hfun.
coln <- colnames(x)
for (z in seq_len(M)) {
  grad <- gfun[[z]]
  hess <- hfun[[z]]
  # Column q of TreeIndex is each row's node id after q - 1 levels of splits
  # (every row starts in the root, node 1).
  TreeIndex <- data.frame(T1 = rep(1, n))
  for (q in seq_len(max_depth)) {
    node_id <- TreeIndex[[q]]
    trnum <- 0 # running id for the nodes created on the next level
    for (j in min(node_id):max(node_id)) {
      in_node <- node_id == j
      ord_index <- which(in_node)
      # Parent sums do not depend on the candidate feature, so compute once.
      og <- sum(grad[ord_index])
      oh <- sum(hess[ord_index])
      # Gain of splitting node j on each feature; stays NA for a feature
      # that would leave one side empty.
      gainlist <- rep(NA_real_, length(coln))
      for (k in seq_along(coln)) {
        subdl_index <- which(data[[coln[k]]] == 0 & in_node)
        subdr_index <- which(data[[coln[k]]] == 1 & in_node)
        if (length(subdl_index) > 0 && length(subdr_index) > 0) {
          subdlg <- sum(grad[subdl_index])
          subdrg <- sum(grad[subdr_index])
          subdlh <- sum(hess[subdl_index])
          subdrh <- sum(hess[subdr_index])
          gainlist[k] <- 1 / 2 * (subdlg^2 / (subdlh + lambda) +
            subdrg^2 / (subdrh + lambda) -
            og^2 / (oh + lambda)) - gam
        }
      }
      # which.max() yields exactly one index even when several features tie
      # for the best gain, and integer(0) when no feature can split the node.
      splitindex <- which.max(gainlist)
      if (length(splitindex) == 1 && gainlist[splitindex] > 0) {
        split_col <- data[[coln[splitindex]]]
        TreeIndex[which(split_col == 0 & in_node), q + 1] <- trnum + 1
        TreeIndex[which(split_col == 1 & in_node), q + 1] <- trnum + 2
        trnum <- trnum + 2
      } else {
        # No split improves the objective: carry the node down unchanged.
        trnum <- trnum + 1
        TreeIndex[ord_index, q + 1] <- trnum
      }
    }
  }
  # Only the deepest level defines the leaves, so the score update is done
  # once per round, after the tree structure is final.
  leaf_id <- TreeIndex[[max_depth + 1]]
  for (p in min(leaf_id):max(leaf_id)) {
    suindex <- leaf_id == p
    subdg <- sum(grad[suindex])
    subdh <- sum(hess[suindex])
    # Newton leaf weight -G / (H + lambda), shrunk by the learning rate eta.
    ffun[suindex, z + 1] <- ffun[suindex, z] - eta * subdg / (subdh + lambda)
    gfun[suindex, z + 1] <- grafun(predfun(ffun[suindex, z + 1]), data$survived[suindex])
    hfun[suindex, z + 1] <- hessfun(predfun(ffun[suindex, z + 1]))
  }
}
# Turn the final-round scores into survival probabilities. The last column of
# ffun belongs to the last boosting round, so take it by position rather than
# by a column name that is only right for one particular value of M.
data$pred <- predfun(ffun[[ncol(ffun)]])
# The statements below were collapsed onto a handful of lines, which R cannot
# parse (several statements per line, and inline `#` comments swallowing the
# code after them). They are laid out again as two runs that share one set of
# helper functions.

# ---- Shared helpers ---------------------------------------------------------

# Binary cross-entropy (negative Bernoulli log-likelihood), element-wise.
# Probabilities are clipped to [eps, 1 - eps] to keep the result finite.
logloss <- function(y, yhat, eps = 1e-15) {
  yhat <- pmin(pmax(yhat, eps), 1 - eps)
  -(y * log(yhat) + (1 - y) * log(1 - yhat))
}

# First derivative of the logistic loss with respect to the raw score.
grafun <- function(pred, label) {
  pred - label
}

# Second derivative of the logistic loss with respect to the raw score.
hessfun <- function(pred) {
  pred * (1 - pred)
}

# Logistic (sigmoid) link: raw score -> probability.
predfun <- function(y) {
  1 / (1 + exp(-y))
}

# Runs M Newton-boosting rounds on 0/1 features.
#
# data             : data frame holding the 0/1 label `survived` and the
#                    feature columns named in `coln`.
# coln             : character vector of 0/1 feature column names.
# ffun, gfun, hfun : one-column data frames with the initial scores,
#                    gradients and hessians; one column is appended per round.
# M, eta, gam, lambda, max_depth : rounds, learning rate, split penalty,
#                    L2 term, tree depth.
#
# Returns list(ffun, gfun, hfun), each with M + 1 columns.
newton_boost <- function(data, coln, ffun, gfun, hfun,
                         M, eta, gam, lambda, max_depth) {
  n <- nrow(data)
  for (z in seq_len(M)) {
    grad <- gfun[[z]]
    hess <- hfun[[z]]
    # Column q of TreeIndex is each row's node id after q - 1 split levels.
    TreeIndex <- data.frame(T1 = rep(1, n))
    for (q in seq_len(max_depth)) {
      node_id <- TreeIndex[[q]]
      trnum <- 0
      for (j in min(node_id):max(node_id)) {
        in_node <- node_id == j
        ord_index <- which(in_node)
        og <- sum(grad[ord_index])
        oh <- sum(hess[ord_index])
        # NA marks a feature that would leave one side of the split empty.
        gainlist <- rep(NA_real_, length(coln))
        for (k in seq_along(coln)) {
          subdl_index <- which(data[[coln[k]]] == 0 & in_node)
          subdr_index <- which(data[[coln[k]]] == 1 & in_node)
          if (length(subdl_index) > 0 && length(subdr_index) > 0) {
            subdlg <- sum(grad[subdl_index])
            subdrg <- sum(grad[subdr_index])
            subdlh <- sum(hess[subdl_index])
            subdrh <- sum(hess[subdr_index])
            gainlist[k] <- 1 / 2 * (subdlg^2 / (subdlh + lambda) +
              subdrg^2 / (subdrh + lambda) -
              og^2 / (oh + lambda)) - gam
          }
        }
        # One index even on ties; integer(0) when nothing can be split.
        splitindex <- which.max(gainlist)
        if (length(splitindex) == 1 && gainlist[splitindex] > 0) {
          split_col <- data[[coln[splitindex]]]
          TreeIndex[which(split_col == 0 & in_node), q + 1] <- trnum + 1
          TreeIndex[which(split_col == 1 & in_node), q + 1] <- trnum + 2
          trnum <- trnum + 2
        } else {
          trnum <- trnum + 1
          TreeIndex[ord_index, q + 1] <- trnum
        }
      }
    }
    # Leaf update from the deepest level: w = -G / (H + lambda), scaled by eta.
    leaf_id <- TreeIndex[[max_depth + 1]]
    for (p in min(leaf_id):max(leaf_id)) {
      suindex <- leaf_id == p
      subdg <- sum(grad[suindex])
      subdh <- sum(hess[suindex])
      ffun[suindex, z + 1] <- ffun[suindex, z] - eta * subdg / (subdh + lambda)
      gfun[suindex, z + 1] <- grafun(predfun(ffun[suindex, z + 1]), data$survived[suindex])
      hfun[suindex, z + 1] <- hessfun(predfun(ffun[suindex, z + 1]))
    }
  }
  list(ffun = ffun, gfun = gfun, hfun = hfun)
}

# ---- Run 1: data downloaded from GitHub -------------------------------------
path <- 'https://raw.githubusercontent.com/thomaspernet/data_csv_r/master/data/titanic_csv.csv'
titanic <- read.csv(path)
head(titanic)
str(titanic)
#install.packages("dplyr")
library(dplyr)
# Drop variables, convert to factor levels, remove incomplete rows
clean_titanic <- titanic %>%
  select(-c(home.dest, cabin, name, X, ticket)) %>%
  mutate(
    pclass = factor(pclass, levels = c(1, 2, 3), labels = c('Upper', 'Middle', 'Lower')),
    survived = factor(survived, levels = c(0, 1), labels = c('No', 'Yes'))
  ) %>%
  na.omit()
glimpse(clean_titanic)
clean_titanic$pclass <- ifelse(clean_titanic$pclass == "Lower", 1, 0)
clean_titanic$survived <- ifelse(clean_titanic$survived == "Yes", 1, 0)
clean_titanic$sex <- ifelse(clean_titanic$sex == "female", 1, 0)
clean_titanic$age_20 <- ifelse(clean_titanic$age <= 20, 1, 0)
clean_titanic$age_40 <- ifelse(20 < clean_titanic$age & clean_titanic$age <= 40, 1, 0)
clean_titanic$age_o40 <- ifelse(clean_titanic$age > 40, 1, 0)

# Model data
data <- clean_titanic %>%
  select(c(survived, pclass, sex, age_20, age_40, age_o40))
y <- data$survived
x <- clean_titanic %>%
  select(c(pclass, sex, age_20, age_40, age_o40))
summary(x)

# Hyperparameters
M <- 10 # Number of boosting iterations
eta <- 0.7 # The learning rate
gam <- 0 # Penalty subtracted from each split gain
lambda <- 0 # L2 term
max_depth <- 2 # Depth of the tree

# Start from the log-odds of the overall survival rate
n <- nrow(data)
ybar <- sum(data$survived) / n
f1 <- log(ybar / (1 - ybar))
ffun <- data.frame(f1 = rep(f1, n))
gfun <- data.frame(g1 = grafun(predfun(f1), data$survived))
hfun <- data.frame(h1 = rep(hessfun(predfun(f1)), n))

fit <- newton_boost(data, colnames(x), ffun, gfun, hfun,
                    M, eta, gam, lambda, max_depth)
ffun <- fit$ffun
gfun <- fit$gfun
hfun <- fit$hfun
# The last column holds the scores after the final round.
data$pred <- predfun(ffun[[ncol(ffun)]])
hist(data$pred)

# ---- Run 2: local copy of the data ------------------------------------------
titanic <- read.csv("/Users/user/Desktop/titanic3.csv")
str(titanic)
#install.packages("dplyr")
library(dplyr)
# NOTE(review): pclass is taken from the first CSV column by position;
# confirm that this matches the file's layout.
titanic$pclass <- titanic[, 1]
clean_titanic <- titanic %>%
  select(c(pclass, survived, sex, age)) %>%
  na.omit()
dim(clean_titanic)
clean_titanic$pclass <- ifelse(clean_titanic[, 1] == 1, 1, 0)
clean_titanic$survived <- ifelse(clean_titanic$survived == 1, 1, 0)
clean_titanic$sex <- ifelse(clean_titanic$sex == "female", 1, 0)
clean_titanic$age_20 <- ifelse(clean_titanic$age <= 20, 1, 0)
clean_titanic$age_40 <- ifelse(20 < clean_titanic$age & clean_titanic$age <= 40, 1, 0)
clean_titanic$age_o40 <- ifelse(clean_titanic$age > 40, 1, 0)

# Model data
data <- clean_titanic %>%
  select(c(survived, pclass, sex, age_20, age_40, age_o40))
y <- data$survived
x <- clean_titanic %>%
  select(c(pclass, sex, age_20, age_40, age_o40))
summary(x)

# Hyperparameters
M <- 5 # Number of boosting iterations
eta <- 0.7 # The learning rate
gam <- 0 # Penalty subtracted from each split gain
lambda <- 0 # L2 term
max_depth <- 2 # Depth of the tree

# Start from the log-odds of the overall survival rate
n <- nrow(data)
ybar <- sum(data$survived) / n
f1 <- log(ybar / (1 - ybar))
ffun <- data.frame(f1 = rep(f1, n))
gfun <- data.frame(g1 = grafun(predfun(f1), data$survived))
hfun <- data.frame(h1 = rep(hessfun(predfun(f1)), n))

fit <- newton_boost(data, colnames(x), ffun, gfun, hfun,
                    M, eta, gam, lambda, max_depth)
ffun <- fit$ffun
gfun <- fit$gfun
hfun <- fit$hfun
data$pred <- predfun(ffun[[ncol(ffun)]])