K-means clustering is a method of cluster analysis used in predictive analysis. It partitions observations or records into clusters based on their proximity to a center. This algorithm is commonly used in marketing and customer relationship management to track customer behavior and create business strategies. Clustering works by grouping similar records together based on a mathematical formula that finds centroids. The k-means algorithm involves an assignment step, where each observation is assigned to the cluster with the closest mean, and an update step, where new means are calculated based on the observations in the cluster. The algorithm repeats until the assignments no longer change. The k-means implementation supports multi-threading, data normalization, different distance level measurements, and cluster quality measurement. It does not support categorical data directly, but categorical attributes can be managed through data transformation.
------

SET SCHEMA DM_PAL;

DROP TABLE PAL_KMEANS_DATA_TBL;
CREATE COLUMN TABLE PAL_KMEANS_DATA_TBL(
	"ID" INTEGER,
	"V000" DOUBLE,
	"V001" VARCHAR(2),
	"V002" DOUBLE
);
INSERT INTO PAL_KMEANS_DATA_TBL VALUES (0, 0.5, 'A', 0.5);
INSERT INTO PAL_KMEANS_DATA_TBL VALUES (1, 1.5, 'A', 0.5);
INSERT INTO PAL_KMEANS_DATA_TBL VALUES (2, 1.5, 'A', 1.5);
INSERT INTO PAL_KMEANS_DATA_TBL VALUES (3, 0.5, 'A', 1.5);
INSERT INTO PAL_KMEANS_DATA_TBL VALUES (4, 1.1, 'B', 1.2);
INSERT INTO PAL_KMEANS_DATA_TBL VALUES (5, 0.5, 'B', 15.5);
INSERT INTO PAL_KMEANS_DATA_TBL VALUES (6, 1.5, 'B', 15.5);
INSERT INTO PAL_KMEANS_DATA_TBL VALUES (7, 1.5, 'B', 16.5);
INSERT INTO PAL_KMEANS_DATA_TBL VALUES (8, 0.5, 'B', 16.5);
INSERT INTO PAL_KMEANS_DATA_TBL VALUES (9, 1.2, 'C', 16.1);
INSERT INTO PAL_KMEANS_DATA_TBL VALUES (10, 15.5, 'C', 15.5);
INSERT INTO PAL_KMEANS_DATA_TBL VALUES (11, 16.5, 'C', 15.5);
INSERT INTO PAL_KMEANS_DATA_TBL VALUES (12, 16.5, 'C', 16.5);
INSERT INTO PAL_KMEANS_DATA_TBL VALUES (13, 15.5, 'C', 16.5);
INSERT INTO PAL_KMEANS_DATA_TBL VALUES (14, 15.6, 'D', 16.2);
INSERT INTO PAL_KMEANS_DATA_TBL VALUES (15, 15.5, 'D', 0.5);
INSERT INTO PAL_KMEANS_DATA_TBL VALUES (16, 16.5, 'D', 0.5);
INSERT INTO PAL_KMEANS_DATA_TBL VALUES (17, 16.5, 'D', 1.5);
INSERT INTO PAL_KMEANS_DATA_TBL VALUES (18, 15.5, 'D', 1.5);
INSERT INTO PAL_KMEANS_DATA_TBL VALUES (19, 15.7, 'A', 1.6);

DROP TABLE #PAL_PARAMETER_TBL;
CREATE LOCAL TEMPORARY COLUMN TABLE #PAL_PARAMETER_TBL(
	"PARAM_NAME" NVARCHAR(256), 
	"INT_VALUE" INTEGER, 
	"DOUBLE_VALUE" DOUBLE, 
	"STRING_VALUE" NVARCHAR(1000)
);
INSERT INTO #PAL_PARAMETER_TBL VALUES ('THREAD_RATIO', NULL, 0.2, NULL);
INSERT INTO #PAL_PARAMETER_TBL VALUES ('GROUP_NUMBER', 4, NULL, NULL);
INSERT INTO #PAL_PARAMETER_TBL VALUES ('INIT_TYPE', 1, NULL, NULL);
INSERT INTO #PAL_PARAMETER_TBL VALUES ('DISTANCE_LEVEL',2, NULL, NULL);
INSERT INTO #PAL_PARAMETER_TBL VALUES ('MAX_ITERATION', 100, NULL, NULL);
INSERT INTO #PAL_PARAMETER_TBL VALUES ('EXIT_THRESHOLD', NULL, 1.0E-6, NULL);
INSERT INTO #PAL_PARAMETER_TBL VALUES ('CATEGORY_WEIGHTS', NULL, 0.5, NULL);

CALL _SYS_AFL.PAL_KMEANS(PAL_KMEANS_DATA_TBL, "#PAL_PARAMETER_TBL", ?, ?, ?, ?, ?);

