Gradient Steepest Descent

Gradient Steepest Descent
YIK LUN, KEI
Minimize SSR
# Global accumulator seed read by toterror() as originally written.
error <- 0
# Four observations, one per row: column 1 is x, column 2 is y.
points <- matrix(c(1, 2, 3, 4, 2, 6, 4, 8), nrow = 4, ncol = 2)
# Sum of squared residuals of the line y = b0 + b1 * x over all rows of
# `points`.
#
# points: numeric matrix, column 1 = x, column 2 = y (one observation per row).
# b0, b1: intercept and slope. They may be vectors of equal length, in which
#         case the result is computed elementwise -- this is what lets the
#         function be passed to outer() to fill a cost grid.
# Returns the SSR (same length as b0/b1); 0 when `points` has no rows.
toterror <- function(points, b0, b1) {
  # Start from a local zero instead of reading a global `error` variable.
  error <- 0
  # seq_len() yields an empty loop for a zero-row matrix (1:0 would not).
  for (i in seq_len(nrow(points))) {
    error <- error + (points[i, 2] - (b0 + b1 * points[i, 1]))^2
  }
  error
}
# Partial derivative of the sum of squared residuals with respect to the
# intercept b0: sum over rows of -2 * (y - (b0 + b1 * x)).
#
# points: numeric matrix, column 1 = x, column 2 = y.
# b0, b1: current intercept and slope.
# Returns the gradient component for b0 (0 when `points` has no rows).
b0_grad <- function(points, b0, b1) {
  grad <- 0
  # Loop header restored: without it `i` is undefined and the braces do not
  # balance.
  for (i in seq_len(nrow(points))) {
    grad <- grad + -2 * (points[i, 2] - (b0 + b1 * points[i, 1]))
  }
  grad
}
# Partial derivative of the sum of squared residuals with respect to the
# slope b1: sum over rows of -2 * x * (y - (b0 + b1 * x)).
#
# points: numeric matrix, column 1 = x, column 2 = y.
# b0, b1: current intercept and slope.
# Returns the gradient component for b1 (0 when `points` has no rows).
b1_grad <- function(points, b0, b1) {
  grad <- 0
  # Loop header restored: without it `i` is undefined and the braces do not
  # balance.
  for (i in seq_len(nrow(points))) {
    grad <- grad + -2 * points[i, 1] * (points[i, 2] - (b0 + b1 * points[i, 1]))
  }
  grad
}
# Starting intercept and slope, and the learning rate.
b0 <- -2
b1 <- 1
alpha <- 0.025

# Preallocated traces of the parameters, one entry per iteration.
b0hist <- numeric(1000)
b1hist <- numeric(1000)

for (i in seq_len(1000)) {
  # Both steps are computed from the current (b0, b1) before either
  # parameter is moved, so the update is simultaneous.
  b0_step <- alpha * b0_grad(points, b0, b1)
  b1_step <- alpha * b1_grad(points, b0, b1)
  b0 <- b0 - b0_step
  b1 <- b1 - b1_step
  b0hist[i] <- b0
  b1hist[i] <- b1
}

# Show the final estimates.
b0
b1
## [1] 1
## [1] 1.6
# Descent path as a two-column matrix: (b0, b1) after each iteration.
bhist <- cbind(b0hist, b1hist)

# Grid of candidate intercepts and slopes on which the cost is evaluated.
b0m <- seq(from = -3, to = 3, by = 0.01)
b1m <- seq(from = -1, to = 3.5, by = 0.01)

# z[j, k] is the sum of squared residuals at (b0m[j], b1m[k]).
z <- outer(b0m, b1m, FUN = "toterror", points = points)

# Contour map of the cost surface with the descent path drawn on top.
contour(b0m, b1m, z, nlevels = 40)
points(bhist, type = "l", col = "blue")
30
40
50
60
70
80
90
100 110 120 130
140 150 160 17

0
20
10
10
30
20
50 40
60
80
9
120 100
110 0
140
160
70
370 36
270 250 240 23 210 20
0 350 34
0
0
0 330 310
430 42
3
20 300 290 28
0 410
0 260
180
220
Matrix Form
# Design matrix: a column of ones for the intercept plus the predictor 1..4.
x0 <- rep(1, times = 4)
x1 <- seq_len(4)
x <- cbind(x0, x1)

# Response as a 4 x 1 column matrix.
y <- matrix(c(2, 6, 4, 8), ncol = 1)

# Starting parameters (intercept, slope) as a 2 x 1 column matrix.
theta <- matrix(c(-2, 1), ncol = 1)

# Per-iteration traces of the parameters and of the cost.
thetahist <- matrix(NA, nrow = 1000, ncol = 2)
costhist <- matrix(NA, nrow = 1000, ncol = 1)

# Learning rate.
alpha <- 0.025
# Fitted values: the matrix product of the design matrix `x` and the
# parameter column `theta`.
h <- function(x, theta) {
  x %*% theta
}
# Sum of squared residuals of the linear fit x %*% theta against y.
cost <- function(x, y, theta) {
  residuals <- x %*% theta - y
  sum(residuals^2)
}
# Gradient of the sum of squared residuals with respect to theta:
# 2 * t(x) %*% (x %*% theta - y), returned as a column matrix.
grad <- function(x, y, theta) {
  residuals <- x %*% theta - y
  2 * t(x) %*% residuals
}
# Reference answers that gradient descent should converge to.
# Normal equations (X'X) b = X'y, solved directly with solve(A, b) rather
# than by forming the explicit inverse of X'X and multiplying.
solve(crossprod(x), crossprod(x, y))
# Same fit via lm(); `+ 0` suppresses lm's own intercept because `x`
# already carries the column of ones.
lm(y ~ x + 0)
150 130
190 170
##    [,1]
## x0  1.0
## x1  1.6
##
## Call:
## lm(formula = y ~ x + 0)
##
## Coefficients:
## xx0  xx1
## 1.0  1.6
# Batch gradient descent in matrix form: 1000 updates of theta, recording
# the parameters and the cost after every update.
for (i in seq_len(1000)) {
  theta <- theta - alpha * grad(x, y, theta)
  # theta is a 2 x 1 column; store its two values as row i of the trace.
  thetahist[i, ] <- drop(theta)
  costhist[i, 1] <- cost(x, y, theta)
}

# Show the final estimate.
theta
##    [,1]
## x0  1.0
## x1  1.6
# Grid of candidate intercepts and slopes for the cost surface.
b0m <- seq(-3, 3, by = 0.01)
b1m <- seq(-1, 3.5, by = 0.01)
# points() only adds to an existing plot, so the cost surface has to be drawn
# first; the grid above is otherwise unused.
# NOTE(review): the outer()/contour() calls are reconstructed from the
# matching plot earlier in the file -- confirm against the original document.
z <- outer(b0m, b1m, FUN = "toterror", points = points)
contour(b0m, b1m, z, nlevels = 40)
# Overlay the path taken by the matrix-form descent.
points(thetahist, type = "l", col = "red")
30
40
50
60
70
80
90
100 110 120 130
140 150 160 17

0
20
10
10
30
20
50 40
60
80
1
0
0
9
120
110 0
140
160
70
370 36
270 250 240 23 210 20
0 350 34
0
0
0 330 310
430 42
320 300 290 2
0 410
80 260
180
220
150 130
190 170
Standardized data makes convergence faster

# Copy the design matrix and standardise only the predictor column
# (subtract its mean, divide by its standard deviation); the column of
# ones used for the intercept is left as is.
x.scaled <- x
x.scaled[, 2] <- local({
  raw <- x[, 2]
  (raw - mean(raw)) / sd(raw)
})
# Reference answers for the standardised predictor.
# Normal equations solved directly with solve(A, b) rather than by forming
# the explicit inverse of X'X and multiplying.
solve(crossprod(x.scaled), crossprod(x.scaled, y))
# Same fit via lm(); here lm adds its own intercept, so only the scaled
# predictor column is supplied.
lm(y ~ x.scaled[, 2])
##        [,1]
## x0 5.000000
## x1 2.065591
##
## Call:
## lm(formula = y ~ x.scaled[, 2])
##
## Coefficients:
##   (Intercept)  x.scaled[, 2]
##         5.000          2.066
# Reset the starting parameters (intercept, slope) as a 2 x 1 column matrix.
theta <- matrix(c(-2, 1), ncol = 1)

# Traces for a 200-iteration run on the standardised data.
thetahist <- matrix(NA, nrow = 200, ncol = 2)
costhist.scaled <- matrix(NA, nrow = 200, ncol = 1)

# Learning rate.
alpha <- 0.025
# Fitted values: the matrix product of the design matrix `x` and the
# parameter column `theta`.
h <- function(x, theta) {
  x %*% theta
}

# Gradient of the sum of squared residuals with respect to theta:
# 2 * t(x) %*% (x %*% theta - y), returned as a column matrix.
grad <- function(x, y, theta) {
  residuals <- x %*% theta - y
  2 * t(x) %*% residuals
}
# Batch gradient descent on the standardised design matrix: 200 updates,
# recording the parameters and the cost after every update.
for (i in seq_len(200)) {
  theta <- theta - alpha * grad(x.scaled, y, theta)
  # theta is a 2 x 1 column; store its two values as row i of the trace.
  thetahist[i, ] <- drop(theta)
  costhist.scaled[i, 1] <- cost(x.scaled, y, theta)
}

# Show the final estimate.
theta
##        [,1]
## x0 5.000000
## x1 2.065591
# Grid of candidate intercepts and slopes for the standardised problem.
b0m <- seq(2, 6, by = 0.01)
b1m <- seq(-3, 6, by = 0.01)
# Observations in the (x, y) layout toterror() expects, using the scaled x.
# Rebinding `points` to a matrix does not hide the points() plotting function:
# R skips non-function bindings when resolving a call.
points <- cbind(x.scaled[, 2], y)
# points() only adds to an existing plot, so the cost surface has to be drawn
# first; the grid and the rebuilt `points` matrix are otherwise unused.
# NOTE(review): the outer()/contour() calls are reconstructed from the
# matching plot earlier in the file -- confirm against the original document.
z <- outer(b0m, b1m, FUN = "toterror", points = points)
contour(b0m, b1m, z, nlevels = 40)
# Overlay the path taken by the descent on the standardised data.
points(thetahist, type = "l", col = "blue")
85
80
75
65
70
55
60
45
55
50
40
30
20
10
15
25
35
55
70
115
110
105
100
95
90
85
50
65
45
60
80 75
85
Compare: scaled data
30
10
20
costhist
40
# Cost per iteration on the raw data (default colour), restricted to the
# first 50 iterations on the x axis.
plot(
  costhist,
  type = "l",
  xlim = c(0, 50)
)
# Cost per iteration on the standardised data, overlaid in red.
lines(
  costhist.scaled,
  type = "l",
  col = "red"
)
10
20
30
Index
Compare: alpha
# Second run on the unscaled data with a smaller learning rate.
alpha2 <- 0.01

# Same starting point as the first run, kept in separate variables.
theta2 <- matrix(c(-2, 1), ncol = 1)
thetahist2 <- matrix(NA, nrow = 1000, ncol = 2)
costhist2 <- matrix(NA, nrow = 1000, ncol = 1)

for (i in seq_len(1000)) {
  theta2 <- theta2 - alpha2 * grad(x, y, theta2)
  # theta2 is a 2 x 1 column; store its two values as row i of the trace.
  thetahist2[i, ] <- drop(theta2)
  costhist2[i, 1] <- cost(x, y, theta2)
}
# Cost per iteration for the first run (default colour), restricted to
# iterations 1 to 50 on the x axis.
plot(
  costhist,
  type = "l",
  xlim = c(1, 50)
)
# Cost per iteration for the alpha2 run, overlaid in red.
lines(
  costhist2,
  type = "l",
  col = "red"
)
40
50
40
costhist
30
20
10
0
10
20
30
Index
40
50

Gradient Steepest Descent

Enviado por

Dados do documento

Título original

Direitos autorais

Formatos disponíveis

Compartilhar este documento

Compartilhar ou incorporar documento

Opções de compartilhamento

Você considera este documento útil?

Este conteúdo é inapropriado?

Direitos autorais:

Formatos disponíveis

Gradient Steepest Descent

Enviado por

Direitos autorais:

Formatos disponíveis

Gradient Steepest Descent

YIK LUN, KEI

100 110 120 130

140 150 160 17

100 110 120 130

140 150 160 17

Standardized data makes convergence faster

h<-function(x,theta) x %*% theta # fitted y

Compare: scaled data

Você também pode gostar