The random decision trees algorithm is an ensemble learning method that uses multiple classification and regression trees to make predictions. It combines bagging and random feature selection techniques to create a diverse set of trees. The algorithm can handle imbalanced data and has a method for estimating missing data. It also provides an unbiased estimate of the generalization error without the need for cross-validation.
------

-- Run every following statement in the DM_PAL schema.
SET SCHEMA DM_PAL;

-- Rebuild the training table from scratch.
-- NOTE: DROP TABLE raises an error if the table does not exist yet (e.g. on a first run).
DROP TABLE PAL_RDT_DATA_TBL;
CREATE COLUMN TABLE PAL_RDT_DATA_TBL (
	"OUTLOOK" VARCHAR(20),
	"TEMP" INTEGER,
	"HUMIDITY" DOUBLE,
	"WINDY" VARCHAR(10),
	"CLASS" VARCHAR(20)
);

-- 14 sample rows. Several rows carry NULL in OUTLOOK, TEMP, HUMIDITY or WINDY
-- (presumably to exercise the algorithm's missing-value handling -- confirm against the PAL docs).
-- Column lists are explicit so the inserts do not depend on column order, and every
-- literal matches its column type (INTEGER for TEMP, DOUBLE for HUMIDITY).
INSERT INTO PAL_RDT_DATA_TBL ("OUTLOOK", "TEMP", "HUMIDITY", "WINDY", "CLASS") VALUES ('Sunny', 75, 70.0, 'Yes', 'Play');
INSERT INTO PAL_RDT_DATA_TBL ("OUTLOOK", "TEMP", "HUMIDITY", "WINDY", "CLASS") VALUES ('Sunny', NULL, 90.0, 'Yes', 'Do not Play');
INSERT INTO PAL_RDT_DATA_TBL ("OUTLOOK", "TEMP", "HUMIDITY", "WINDY", "CLASS") VALUES ('Sunny', 85, NULL, 'No', 'Do not Play');
INSERT INTO PAL_RDT_DATA_TBL ("OUTLOOK", "TEMP", "HUMIDITY", "WINDY", "CLASS") VALUES ('Sunny', 72, 95.0, 'No', 'Do not Play');
INSERT INTO PAL_RDT_DATA_TBL ("OUTLOOK", "TEMP", "HUMIDITY", "WINDY", "CLASS") VALUES (NULL, NULL, 70.0, NULL, 'Play');
INSERT INTO PAL_RDT_DATA_TBL ("OUTLOOK", "TEMP", "HUMIDITY", "WINDY", "CLASS") VALUES ('Overcast', 72, 90.0, 'Yes', 'Play');
INSERT INTO PAL_RDT_DATA_TBL ("OUTLOOK", "TEMP", "HUMIDITY", "WINDY", "CLASS") VALUES ('Overcast', 83, 78.0, 'No', 'Play');
INSERT INTO PAL_RDT_DATA_TBL ("OUTLOOK", "TEMP", "HUMIDITY", "WINDY", "CLASS") VALUES ('Overcast', 64, 65.0, 'Yes', 'Play');
INSERT INTO PAL_RDT_DATA_TBL ("OUTLOOK", "TEMP", "HUMIDITY", "WINDY", "CLASS") VALUES ('Overcast', 81, 75.0, 'No', 'Play');
INSERT INTO PAL_RDT_DATA_TBL ("OUTLOOK", "TEMP", "HUMIDITY", "WINDY", "CLASS") VALUES (NULL, 71, 80.0, 'Yes', 'Do not Play');
INSERT INTO PAL_RDT_DATA_TBL ("OUTLOOK", "TEMP", "HUMIDITY", "WINDY", "CLASS") VALUES ('Rain', 65, 70.0, 'Yes', 'Do not Play');
INSERT INTO PAL_RDT_DATA_TBL ("OUTLOOK", "TEMP", "HUMIDITY", "WINDY", "CLASS") VALUES ('Rain', 75, 80.0, 'No', 'Play');
INSERT INTO PAL_RDT_DATA_TBL ("OUTLOOK", "TEMP", "HUMIDITY", "WINDY", "CLASS") VALUES ('Rain', 68, 80.0, 'No', 'Play');
INSERT INTO PAL_RDT_DATA_TBL ("OUTLOOK", "TEMP", "HUMIDITY", "WINDY", "CLASS") VALUES ('Rain', 70, 96.0, 'No', 'Play');

-- Rebuild the parameter table passed to the PAL procedure: one row per setting, with the
-- value(s) placed in the typed column(s) that apply and the unused columns left NULL.
-- NOTE: DROP TABLE raises an error if the table does not exist yet (e.g. on a first run).
DROP TABLE PAL_PARAMETER_TBL;
CREATE COLUMN TABLE PAL_PARAMETER_TBL (
	"PARAM_NAME" VARCHAR(100),
	"INT_VALUE" INTEGER,
	"DOUBLE_VALUE" DOUBLE,
	"STRING_VALUE" VARCHAR(100)
);

-- Settings used for this training run. The meaning and valid range of each parameter are
-- defined by PAL; consult the PAL reference before changing a value.
-- Column lists are explicit so the inserts do not depend on column order.
INSERT INTO PAL_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE") VALUES ('TREES_NUM', 300, NULL, NULL);
INSERT INTO PAL_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE") VALUES ('TRY_NUM', 3, NULL, NULL);
INSERT INTO PAL_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE") VALUES ('SEED', 2, NULL, NULL);
INSERT INTO PAL_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE") VALUES ('COMPRESSION', 0, NULL, NULL);
INSERT INTO PAL_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE") VALUES ('SPLIT_THRESHOLD', NULL, 1e-5, NULL);
INSERT INTO PAL_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE") VALUES ('CALCULATE_OOB', 1, NULL, NULL);
INSERT INTO PAL_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE") VALUES ('NODE_SIZE', 1, NULL, NULL);
INSERT INTO PAL_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE") VALUES ('THREAD_RATIO', NULL, 1.0, NULL);
INSERT INTO PAL_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE") VALUES ('MODEL_FORMAT', 2, NULL, NULL);

-- Optional: a possible setting for stratified sampling (uncomment to enable).
--INSERT INTO PAL_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE") VALUES ('SAMPLE_FRACTION', NULL, 1.0, NULL);
--INSERT INTO PAL_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE") VALUES ('STRATA', NULL, 0.5, 'Do not Play');  -- for class 'Do not Play'
--INSERT INTO PAL_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE") VALUES ('STRATA', NULL, 0.5, 'Play');  -- for class 'Play'

-- Optional: a possible setting for prior probability (uncomment to enable).
--INSERT INTO PAL_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE") VALUES ('PRIORS', NULL, 0.45, 'Do not Play');  -- for class 'Do not Play'
--INSERT INTO PAL_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE") VALUES ('PRIORS', NULL, 0.55, 'Play');  -- for class 'Play'


-- Persistent copy of the trained model, kept so that a later prediction step can read it.
-- NOTE: DROP TABLE raises an error if the table does not exist yet (e.g. on a first run).
DROP TABLE PAL_RDT_MODEL_TBL;
CREATE COLUMN TABLE PAL_RDT_MODEL_TBL (
	ROW_INDEX     INTEGER,
	TREE_INDEX    INTEGER,
	MODEL_CONTENT NVARCHAR(5000)
);


-- Train the random decision trees model, store it, and display every output table.
DO BEGIN
	-- Inputs for the PAL call: the parameter rows and the training rows.
	pal_params = SELECT * FROM PAL_PARAMETER_TBL;
	train_rows = SELECT * FROM PAL_RDT_DATA_TBL;

	-- The last four arguments are outputs, bound to new table variables.
	CALL _SYS_AFL.PAL_RANDOM_DECISION_TREES(:train_rows, :pal_params, rdt_model, importance, oob_error, confusion_matrix);

	-- Replace whatever model is currently stored with the freshly trained one.
	TRUNCATE TABLE PAL_RDT_MODEL_TBL;
	INSERT INTO PAL_RDT_MODEL_TBL SELECT * FROM :rdt_model;

	-- Result sets, in order: stored model, then the remaining three procedure outputs.
	SELECT * FROM PAL_RDT_MODEL_TBL;
	SELECT * FROM :importance;
	SELECT * FROM :oob_error;
	SELECT * FROM :confusion_matrix;
END;
