import numpy as np
import pandas as pd

x = np.arange(0, 50)
x = pd.DataFrame({'x': x})
# five random uniform distributions over different ranges
y1 = np.random.uniform(10,15,10)
y2 = np.random.uniform(20,25,10)
y3 = np.random.uniform(0,5,10)
y4 = np.random.uniform(30,32,10)
y5 = np.random.uniform(13,17,10)
y = np.concatenate((y1,y2,y3,y4,y5))
y = y[:,None] # reshape to an (n, 1) column vector
xi = x # working copy of input
yi = y # working copy of target
# keep x, y untouched; the loop reads/updates xi, yi where the original y must not change
ei = 0 # initialization of error
n = len(yi) # number of rows
predf = 0 # initial prediction is 0
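# The DecisionTree scratch class lives in the shared github/kaggle link; a
# minimal stand-in is sketched here so this cell runs on its own (an
# assumption, not the original class): a single-variable stump whose
# find_better_split(var_idx) scans every value of column var_idx and keeps
# the split value that minimizes the size-weighted std. deviation of the
# target across the two sides.
class DecisionTree:
    def __init__(self, x, y):
        self.x, self.y = x, y

    def find_better_split(self, var_idx):
        col = self.x.values[:, var_idx]
        best_score = float('inf')
        for val in col:
            lhs, rhs = self.y[col <= val], self.y[col > val]
            if len(lhs) == 0 or len(rhs) == 0:
                continue  # a one-sided split tells us nothing
            score = lhs.std() * len(lhs) + rhs.std() * len(rhs)
            if score < best_score:
                best_score, self.split = score, val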
for i in range(30): # loop will make 30 trees (n_estimators).
    tree = DecisionTree(xi, yi) # DecisionTree scratch code can be found in the shared github/kaggle link.
    # It just creates a single decision tree with the provided min. samples per leaf
    tree.find_better_split(0) # for the selected input variable, this splits the (n) data points so that the std. deviation of
    # the target variable in both splits is minimal compared to all other splits
    r = np.where(xi == tree.split)[0][0] # index where this best split occurs
    left_idx = np.where(xi <= tree.split)[0] # indices on the lhs of the split
    right_idx = np.where(xi > tree.split)[0] # indices on the rhs of the split
    # Step 5: fit the next model on the residuals that are still left, i.e. [e2 = y - y_predicted2],
    # and repeat steps 2 to 5 until the model starts overfitting or the sum of residuals becomes
    # constant. Overfitting can be controlled by continually checking accuracy on validation data.
    # predictions by the ith decision tree
    predi = np.zeros(n)
    np.put(predi, left_idx, np.repeat(np.mean(yi[left_idx]), r)) # fill left side with the left-side mean of y
    np.put(predi, right_idx, np.repeat(np.mean(yi[right_idx]), n-r)) # fill right side with the right-side mean of y
    predi = predi[:,None] # reshape to an (n, 1) column vector, compatible with y
    predf = predf + predi # final prediction = previous prediction + new prediction of the residual
    ei = y - predf # the original y is needed here, as the residual is always taken from the original y
    yi = ei # update yi with the residual and loop again
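# One quick way to eyeball the result (not part of the original walkthrough; the
# matplotlib usage below is an assumption): plot the boosted fit over the data.
import matplotlib.pyplot as plt
plt.scatter(x['x'], y, label='actual y') # original noisy targets
plt.plot(x['x'], predf, color='r', label='boosted prediction') # sum of all 30 stage outputs
plt.legend()
plt.show()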