31. page 31
-- Linear regression (y = a*x + b) by ordinary least squares in plain SQL.
-- Estimates price (y) from size (x); returns one row: slope (a), intercept (b).
WITH records AS (
    -- Sample observations: size (x_value), price (y_value).
    SELECT
        x_value,
        y_value
    FROM (
        VALUES
            (16, 86800),
            (64, 98800),
            (128, 110800)
    ) AS t (x_value, y_value)
),
prepare AS (
    -- Aggregates required by the closed-form least-squares formulas.
    SELECT
        SUM(x_value) AS sumx,
        SUM(y_value) AS sumy,
        SUM(x_value * x_value) AS sumxx,
        SUM(x_value * y_value) AS sumxy,
        COUNT(1) AS cnt
    FROM records
),
calc AS (
    SELECT
        -- bunbo: shared denominator, n * sum(x^2) - (sum(x))^2.
        -- NOTE(review): pow() returns a double in Presto, which is what keeps
        -- the divisions below from being integer divisions; confirm on other engines.
        (cnt * sumxx - pow(sumx, 2)) AS bunbo,
        -- bunshi_a: numerator of the slope.
        (cnt * sumxy - sumx * sumy) AS bunshi_a,
        -- bunshi_b: numerator of the intercept.
        (sumxx * sumy - sumxy * sumx) AS bunshi_b
    FROM prepare
)
SELECT
    -- NULLIF maps a zero denominator (a single row, or every x identical)
    -- to NULL, so a degenerate fit yields NULL instead of dividing by zero.
    bunshi_a / NULLIF(bunbo, 0) AS slope,
    bunshi_b / NULLIF(bunbo, 0) AS intercept
FROM calc
標準的なSQLで求める線形回帰分析
36. page 36
-- (Presto) Fit y = intercept + x * slope using the regr_* aggregates.
WITH records AS (
    -- Sample observations: x_value, y_value.
    SELECT
        samples.x_value,
        samples.y_value
    FROM (
        VALUES
            (16, 86800),
            (64, 98800),
            (128, 110800)
    ) AS samples (x_value, y_value)
)
SELECT
    regr_intercept(y_value, x_value) AS intercept,
    regr_slope(y_value, x_value) AS slope
FROM records
37. page 37
-- (Presto UDF) Regression line y = intercept + x * slope for the sample data.
WITH records AS (
    -- Inline sample points, exposed as (x_value, y_value).
    SELECT
        pts.x_value,
        pts.y_value
    FROM (
        VALUES
            (16, 86800),
            (64, 98800),
            (128, 110800)
    ) AS pts (x_value, y_value)
)
SELECT
    regr_intercept(y_value, x_value) AS intercept,
    regr_slope(y_value, x_value) AS slope
FROM records
-- Linear regression (y = a*x + b) by ordinary least squares in plain SQL.
-- Estimates price (y) from size (x); returns one row: slope (a), intercept (b).
WITH records AS (
    -- Sample observations: size (x_value), price (y_value).
    SELECT
        x_value,
        y_value
    FROM (
        VALUES
            (16, 86800),
            (64, 98800),
            (128, 110800)
    ) AS t (x_value, y_value)
),
prepare AS (
    -- Aggregates required by the closed-form least-squares formulas.
    SELECT
        SUM(x_value) AS sumx,
        SUM(y_value) AS sumy,
        SUM(x_value * x_value) AS sumxx,
        SUM(x_value * y_value) AS sumxy,
        COUNT(1) AS cnt
    FROM records
),
calc AS (
    SELECT
        -- bunbo: shared denominator, n * sum(x^2) - (sum(x))^2.
        -- NOTE(review): pow() returns a double in Presto, which is what keeps
        -- the divisions below from being integer divisions; confirm on other engines.
        (cnt * sumxx - pow(sumx, 2)) AS bunbo,
        -- bunshi_a: numerator of the slope.
        (cnt * sumxy - sumx * sumy) AS bunshi_a,
        -- bunshi_b: numerator of the intercept.
        (sumxx * sumy - sumxy * sumx) AS bunshi_b
    FROM prepare
)
SELECT
    -- NULLIF maps a zero denominator (a single row, or every x identical)
    -- to NULL, so a degenerate fit yields NULL instead of dividing by zero.
    bunshi_a / NULLIF(bunbo, 0) AS slope,
    bunshi_b / NULLIF(bunbo, 0) AS intercept
FROM calc
たったこれだけ!
44. page 44
-- Regression analysis with Hivemall: the sample observations as a CTE.
WITH records AS (
    SELECT
        16 AS x_value,
        86800 AS y_value
    UNION ALL
    SELECT
        64 AS x_value,
        98800 AS y_value
    UNION ALL
    SELECT
        128 AS x_value,
        110800 AS y_value
)
SELECT
    x_value,
    y_value
FROM records
46. page 46
-- Build one training row per record: a row id, the target (y_value), and a
-- feature array of 'bias', 'size:<x_value>', and '128GB' only when x_value = 128.
SELECT
    row_number() OVER () AS rowid,
    y_value AS target,
    array_remove(
        array(
            'bias',
            concat('size:', x_value),
            -- Yields NULL for every other size; array_remove(..., NULL) is
            -- applied to the array afterwards.
            CASE WHEN x_value = 128 THEN '128GB' ELSE NULL END
        ),
        NULL
    ) AS features
FROM records
49. page 49
-- Feed the amplified training rows to arowe_regress, which emits
-- (feature, weight, covar) rows.
SELECT
    arowe_regress(features, target) AS (feature, weight, covar)
FROM (
    SELECT
        rowid,
        target,
        features
    FROM (
        -- NOTE(review): amplify(20000, ...) presumably replicates each input
        -- row 20000 times; confirm against the Hivemall documentation.
        SELECT
            amplify(20000, rowid, target, features) AS (rowid, target, features)
        FROM train
    ) amplified
    -- Distribute and sort the rows by a seeded random key.
    CLUSTER BY rand(1)
) train_amplify
53. page 53
-- Hivemall AROWe regression, end to end:
-- records -> train (features) -> regression (per-row weights) -> model.
WITH records AS (
    -- Sample observations: x_value, y_value.
    SELECT
        16 AS x_value,
        86800 AS y_value
    UNION ALL
    SELECT
        64 AS x_value,
        98800 AS y_value
    UNION ALL
    SELECT
        128 AS x_value,
        110800 AS y_value
),
train AS (
    -- One training row per record: row id, target, and feature array.
    SELECT
        row_number() OVER () AS rowid,
        y_value AS target,
        array_remove(
            array(
                'bias',
                concat('size:', x_value),
                -- NULL unless x_value = 128; array_remove(..., NULL) is
                -- applied to the array afterwards.
                CASE WHEN x_value = 128 THEN '128GB' ELSE NULL END
            ),
            NULL
        ) AS features
    FROM records
),
regression AS (
    -- arowe_regress emits (feature, weight, covar) rows.
    SELECT
        arowe_regress(features, target) AS (feature, weight, covar)
    FROM (
        SELECT
            rowid,
            target,
            features
        FROM (
            -- NOTE(review): amplify(20000, ...) presumably replicates each
            -- input row 20000 times; confirm against the Hivemall documentation.
            SELECT
                amplify(20000, rowid, target, features) AS (rowid, target, features)
            FROM train
        ) amplified
        -- Distribute and sort the rows by a seeded random key.
        CLUSTER BY rand(1)
    ) train_amplify
),
model AS (
    -- Collapse the emitted rows to a single weight per feature.
    SELECT
        feature,
        argmin_kld(weight, covar) AS weight
    FROM regression
    GROUP BY feature
)
SELECT
    feature,
    weight
FROM model
Hivemallを用いた回帰分析