Latent Dirichlet Allocation (LDA) is a generative probabilistic model used in text modeling. It assumes that each word in a document is generated from a mixture of latent topics: each document consists of multiple topics with different probabilities, and each topic is in turn a probability distribution over words, so each word belongs to certain topics with different probabilities. Parameter inference in LDA is typically done using Gibbs sampling; the BURNIN, THIN, and ITERATION parameters in the example below configure that sampler.
------

-- Work in the DM_PAL schema; every unqualified table name below resolves there.
SET SCHEMA DM_PAL;

-- Input corpus for LDA: one row per document, with the document's words
-- stored as a single delimited string in "TEXT".
-- NOTE: the DROP raises an error on a first run, when the table does not
-- exist yet; that error can be ignored (or the statement skipped).
DROP TABLE PAL_LATENT_DIRICHLET_ALLOCATION_DATA_TBL;
CREATE COLUMN TABLE PAL_LATENT_DIRICHLET_ALLOCATION_DATA_TBL (
    "DOCUMENT_ID" INTEGER,
    "TEXT"        NVARCHAR(5000)
);

-- Four sample documents. Column lists are spelled out so the inserts do not
-- depend on the physical column order of the table.
INSERT INTO PAL_LATENT_DIRICHLET_ALLOCATION_DATA_TBL ("DOCUMENT_ID", "TEXT")
    VALUES (10, 'cpu harddisk graphiccard cpu monitor keyboard cpu memory memory');
INSERT INTO PAL_LATENT_DIRICHLET_ALLOCATION_DATA_TBL ("DOCUMENT_ID", "TEXT")
    VALUES (20, 'tires mountainbike wheels valve helmet mountainbike rearfender tires mountainbike mountainbike');
INSERT INTO PAL_LATENT_DIRICHLET_ALLOCATION_DATA_TBL ("DOCUMENT_ID", "TEXT")
    VALUES (30, 'carseat toy strollers toy toy spoon toy strollers toy carseat');
INSERT INTO PAL_LATENT_DIRICHLET_ALLOCATION_DATA_TBL ("DOCUMENT_ID", "TEXT")
    VALUES (40, 'sweaters sweaters sweaters boots sweaters rings vest vest shoe sweaters');

-- Parameter table handed to the PAL LDA procedure. Each row carries one named
-- parameter; exactly one of INT_VALUE / DOUBLE_VALUE / STRING_VALUE is filled.
-- NOTE: the DROP raises an error on a first run, when the table does not
-- exist yet; that error can be ignored (or the statement skipped).
DROP TABLE PAL_LDA_PARAMETER_TBL;
CREATE COLUMN TABLE PAL_LDA_PARAMETER_TBL (
    "PARAM_NAME"   VARCHAR(256),
    "INT_VALUE"    INTEGER,
    "DOUBLE_VALUE" DOUBLE,
    "STRING_VALUE" VARCHAR(1000)
);

-- Parameter meanings below follow the SAP HANA PAL LDA documentation;
-- verify them against the PAL version you are running.

-- Number of latent topics to fit.
INSERT INTO PAL_LDA_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE")
    VALUES ('TOPICS', 6, NULL, NULL);
-- Gibbs sampler schedule: iterations discarded at the start (BURNIN),
-- iterations skipped between kept samples (THIN), and total iterations.
INSERT INTO PAL_LDA_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE")
    VALUES ('BURNIN', 50, NULL, NULL);
INSERT INTO PAL_LDA_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE")
    VALUES ('THIN', 10, NULL, NULL);
INSERT INTO PAL_LDA_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE")
    VALUES ('ITERATION', 100, NULL, NULL);
-- Fixed random seed so repeated runs are reproducible.
INSERT INTO PAL_LDA_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE")
    VALUES ('SEED', 1, NULL, NULL);
-- Dirichlet prior hyperparameter for the document-topic distribution.
INSERT INTO PAL_LDA_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE")
    VALUES ('ALPHA', NULL, 0.1, NULL);
-- Number of top words reported per topic.
INSERT INTO PAL_LDA_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE")
    VALUES ('MAX_TOP_WORDS', 5, NULL, NULL);
-- 1 = also produce the per-word topic assignment output.
INSERT INTO PAL_LDA_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE")
    VALUES ('OUTPUT_WORD_ASSIGNMENT', 1, NULL, NULL);
-- Word delimiters: space, carriage return (CHAR(13)) and line feed (CHAR(10)).
INSERT INTO PAL_LDA_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE")
    VALUES ('DELIMIT', NULL, NULL, ' ' || CHAR(13) || CHAR(10));

-- Tables that keep three of the LDA procedure outputs after the anonymous
-- block below has finished: the topic-word distribution, the word
-- dictionary, and the CV parameter table.
-- NOTE: each DROP raises an error on a first run, when the table does not
-- exist yet; those errors can be ignored.
DROP TABLE PAL_LDA_TOPIC_WORD_DISTRIBUTION_TBL;
DROP TABLE PAL_LDA_DICTIONARY_TBL;
DROP TABLE PAL_LDA_CV_PARAMETER_TBL;

-- One row per (topic, word) pair with its probability.
CREATE COLUMN TABLE PAL_LDA_TOPIC_WORD_DISTRIBUTION_TBL (
    "TOPIC_ID"    INTEGER,
    "WORD_ID"     INTEGER,
    "PROBABILITY" DOUBLE
);

-- Maps each WORD_ID to its word.
CREATE COLUMN TABLE PAL_LDA_DICTIONARY_TBL (
    "WORD_ID" INTEGER,
    "WORD"    NVARCHAR(5000)
);

-- Same name/int/double/string layout as the input parameter table.
CREATE COLUMN TABLE PAL_LDA_CV_PARAMETER_TBL (
    "PARAM_NAME"   VARCHAR(256),
    "INT_VALUE"    INTEGER,
    "DOUBLE_VALUE" DOUBLE,
    "STRING_VALUE" VARCHAR(1000)
);

-- Fit the LDA model with the PAL procedure and persist three of its outputs.
DO BEGIN
  -- Inputs: the document corpus and the parameter table. Columns are named
  -- explicitly so the call does not depend on the tables' physical layout.
  lt_data = SELECT "DOCUMENT_ID", "TEXT"
              FROM PAL_LATENT_DIRICHLET_ALLOCATION_DATA_TBL;
  lt_param = SELECT "PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE"
               FROM PAL_LDA_PARAMETER_TBL;

  -- Two inputs followed by seven output table variables (bound by position).
  CALL _SYS_AFL.PAL_LATENT_DIRICHLET_ALLOCATION (
    :lt_data,
    :lt_param,
    lt_document_topic_distribution,
    lt_word_topic_assignment,
    lt_topic_top_words,
    lt_topic_word_distribution,
    lt_dictionary,
    lt_statistics,
    lt_cv_parameter
  );

  -- Only these three outputs are stored; the other four are not persisted by
  -- this script. SELECT * is kept on the procedure outputs because their
  -- column names are defined by the procedure, so values are matched to the
  -- listed target columns by position.
  INSERT INTO PAL_LDA_TOPIC_WORD_DISTRIBUTION_TBL ("TOPIC_ID", "WORD_ID", "PROBABILITY")
    SELECT * FROM :lt_topic_word_distribution;
  INSERT INTO PAL_LDA_DICTIONARY_TBL ("WORD_ID", "WORD")
    SELECT * FROM :lt_dictionary;
  INSERT INTO PAL_LDA_CV_PARAMETER_TBL ("PARAM_NAME", "INT_VALUE", "DOUBLE_VALUE", "STRING_VALUE")
    SELECT * FROM :lt_cv_parameter;
END;

