train classification model from a dataset created by a query
CREATE OR REPLACE MODEL `ecommerce.classification_model`
OPTIONS
(
model_type='logistic_reg',
labels = ['will_buy_on_return_visit']
)
AS
#standardSQL
SELECT
* EXCEPT(fullVisitorId)
FROM
# features
(SELECT
fullVisitorId,
IFNULL(totals.bounces, 0) AS bounces,
IFNULL(totals.timeOnSite, 0) AS time_on_site
FROM
`data-to-insights.ecommerce.web_analytics`
WHERE
totals.newVisits = 1
AND date BETWEEN '20160801' AND '20170430') # train on first 9 months
JOIN
(SELECT
fullvisitorid,
IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit
FROM
`data-to-insights.ecommerce.web_analytics`
GROUP BY fullvisitorid)
USING (fullVisitorId)
;
evaluate roc auc score
SELECT
roc_auc,
CASE
WHEN roc_auc > .9 THEN 'good'
WHEN roc_auc > .8 THEN 'fair'
WHEN roc_auc > .7 THEN 'not great'
ELSE
'poor'
END
AS model_quality
FROM
ML.EVALUATE(MODEL ecommerce.classification_model,
(
SELECT
* EXCEPT(fullVisitorId)
FROM
# features
(
SELECT
fullVisitorId,
IFNULL(totals.bounces,
0) AS bounces,
IFNULL(totals.timeOnSite,
0) AS time_on_site
FROM
`data-to-insights.ecommerce.web_analytics`
WHERE
totals.newVisits = 1
AND date BETWEEN '20170501'
AND '20170630') # eval on 2 months
JOIN (
SELECT
fullvisitorid,
IF
(COUNTIF(totals.transactions > 0
AND totals.newVisits IS NULL) > 0,
1,
0) AS will_buy_on_return_visit
FROM
`data-to-insights.ecommerce.web_analytics`
GROUP BY
fullvisitorid)
USING
(fullVisitorId) ));
predict with trained model on the later dataset
SELECT
*
FROM
ml.PREDICT(MODEL `ecommerce.classification_model_2`,
(
WITH
all_visitor_stats AS (
SELECT
fullvisitorid,
IF
(COUNTIF(totals.transactions > 0
AND totals.newVisits IS NULL) > 0,
1,
0) AS will_buy_on_return_visit
FROM
`data-to-insights.ecommerce.web_analytics`
GROUP BY
fullvisitorid )
SELECT
CONCAT(fullvisitorid, '-',CAST(visitId AS STRING)) AS unique_session_id,
# labels
will_buy_on_return_visit,
MAX(CAST(h.eCommerceAction.action_type AS INT64)) AS latest_ecommerce_progress,
# behavior on the site
IFNULL(totals.bounces,
0) AS bounces,
IFNULL(totals.timeOnSite,
0) AS time_on_site,
totals.pageviews,
# where the visitor came from
trafficSource.source,
trafficSource.medium,
channelGrouping,
# mobile or desktop
device.deviceCategory,
# geographic
IFNULL(geoNetwork.country,
"") AS country
FROM
`data-to-insights.ecommerce.web_analytics`,
UNNEST(hits) AS h
JOIN
all_visitor_stats
USING
(fullvisitorid)
WHERE
# only predict for new visits
totals.newVisits = 1
AND date BETWEEN '20170701'
AND '20170801' # test 1 month
GROUP BY
unique_session_id,
will_buy_on_return_visit,
bounces,
time_on_site,
totals.pageviews,
trafficSource.source,
trafficSource.medium,
channelGrouping,
device.deviceCategory,
country ) )
ORDER BY
predicted_will_buy_on_return_visit DESC;
types of models available
https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-create#model_type
Deep learning model creation from sql syntax ref
https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-create-dnn-models
XGBoost model creation from sql syntax ref
https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-create-boosted-tree
AutoML syntax and ref for sql in BQ
https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-create-automl