Performed on So Jun 19 16:11:24 2016.
df <- dbGetQuery(jdbcConnection,
"with rn as (
select DBMS_RANDOM.VALUE(0,1) rn1,
DBMS_RANDOM.VALUE(0,1) rn2
from dual connect by level <= 100
), rate as (
select round(rn1*100) vol,
floor(rn2*3) tarif
from rn)
select
tarif line_id,
vol x,
round(
case when tarif = 0 then 50
when tarif = 1 then vol* 2.5
when tarif = 2 then 50 + (vol-50) * 1.5 end) as y
from rate
")
head(df[,c("X","Y")])
## X Y
## 1 30 75
## 2 5 50
## 3 40 35
## 4 66 165
## 5 57 143
## 6 34 50
The data is stored in the file linear_clustering_avd.csv
LINE_ID use case 1,2
X,Y coordinates of the point
Example
## [1] TRUE
Following formulas are used: i - intercept; c - coefficient
A(y) = i + c * A(x)
B(y) = i + c * B(x)
A(y) - B(y) = c * (A(x) - B(x))
c = (A(y) - B(y)) / (A(x) - B(x))
i = A(y) - c * A(x)
df.coeff <- dbGetQuery(jdbcConnection,
"-- derive intercept + coeff
with derive as (
select a.x a_x, b.x b_x, a.x - b.x delta_x,
a.y a_y, b.y b_y, a.y - b.y delta_y
from LINEAR_CLUST_ADV a, LINEAR_CLUST_ADV b
where a.x > b.x + 5 /* minimum line length */
), derive2 as (
select
a_x, b_x,a_y, b_y,
round(DELTA_Y / DELTA_X,6) coeff
from derive
), clust as (
select a_x, b_x,a_y, b_y,
a_y - coeff * a_x as intercept,
coeff
from derive2)
select A_X, B_X, A_Y, B_Y, INTERCEPT, COEFF from clust")
head(df.coeff)
## A_X B_X A_Y B_Y INTERCEPT COEFF
## 1 99 93 124 233 1922.500033 -18.166667
## 2 99 91 124 112 -24.500000 1.500000
## 3 99 90 124 110 -30.000044 1.555556
## 4 99 90 124 110 -30.000044 1.555556
## 5 99 89 124 50 -608.600000 7.400000
## 6 99 88 124 107 -29.000045 1.545455
library(ggplot2)
library(hexbin)
dfs <- subset(df.coeff, COEFF > -1 & COEFF < 4 & INTERCEPT > -30 & INTERCEPT < 60)
xy <- ggplot(dfs, aes(x=COEFF, y=INTERCEPT)) +
geom_hex(binwidth=c((max(dfs$COEFF)-min(dfs$COEFF))/50, (max(dfs$INTERCEPT)-min(dfs$INTERCEPT))/50)) +
scale_fill_continuous(lim=c(25,700), na.value=NA) +
labs(title = "Linear Clustering - Intercept and Coefficient")
print (xy)
The coefficients (setup , coeff) are visualized in bubble graph. The size of the bubble corresponds to the number of points of a particular line.
library(ggplot2)
ggplot(df.agg, aes(x=COEFF, y=INTERCEPT, size=POINT_DIST_CNT, label = ''),guide=FALSE)+
geom_point(alpha=0.6, shape=21)+ scale_size_area(max_size = 10)+
scale_x_continuous(name="Coefficient", limits=c(-1,4))+
scale_y_continuous(name="Intercept", limits=c(-30,60))+
geom_text(size=4)+
theme_bw() +
ggtitle("Setup and Coefficients - Linear Clustering")
## Warning: Removed 1245 rows containing missing values (geom_point).
## Warning: Removed 1245 rows containing missing values (geom_text).
In the next step the most frequent intercept and coefficient are found by simple aggregating and counting. More precise result may be found using clustering with customized distance measure.
df.agg <- dbGetQuery(jdbcConnection,
"-- derive intercept + coeff
with derive as (
select a.x a_x, b.x b_x, a.x - b.x delta_x,
a.y a_y, b.y b_y, a.y - b.y delta_y
from LINEAR_CLUST_ADV a, LINEAR_CLUST_ADV b
where a.x > b.x + 5 /* minimum line length */
), derive2 as (
select
a_x, b_x,a_y, b_y,
round(DELTA_Y / DELTA_X,6) coeff
from derive
), clust as (
select a_x, b_x,a_y, b_y,
a_y - coeff * a_x as intercept,
coeff
from derive2),
clust2 as (
select intercept, coeff, count(*) cnt, count(distinct a_x ||','|| b_x||','||a_y||','||b_y) point_dist_cnt
from clust
group by intercept, coeff
)
select * from clust2
order by POINT_DIST_CNT desc
")
head(df.agg)
## INTERCEPT COEFF CNT POINT_DIST_CNT
## 1 50.000000 0.000000 509 372
## 2 -25.000000 1.500000 247 208
## 3 0.000000 2.500000 118 70
## 4 0.500000 2.500000 63 63
## 5 -24.500000 1.500000 57 57
## 6 -25.565211 1.521739 2 2
## Plot
dfc <- subset(df.agg,POINT_DIST_CNT > 40)
xy <- xyplot(Y ~ X,
auto.key=TRUE,
panel = function(x,y,...) {
panel.xyplot(x,y,...)
if (nrow(dfc) > 0) {
for (i in 1 : nrow(dfc)) {
panel.abline(a= dfc[i,"INTERCEPT"],b = dfc[i,"COEFF"], col = "red")}}},
data = df , type=c("p","g"),
scales=list(x=list(rot=90, cex= .9 ),y=list(cex=.9)),par.strip.text=list(cex=.8),
ylab="y", xlab="x", main= "Linear Clustering - Matched Lines" )
print (xy)