-- Extensions (must be created first: the tables, indexes and trigger function
-- below reference types, operator classes and functions that they provide).
-- CASCADE also installs any not-yet-installed extension the named one depends on.

-- Provides the VECTOR column type and the vector operator class used on
-- chunks.embedding.
CREATE EXTENSION IF NOT EXISTS vector CASCADE;

-- NOTE(review): nothing visible in this file references vchord directly;
-- presumably it is used by queries or indexes defined elsewhere -- confirm.
CREATE EXTENSION IF NOT EXISTS vchord CASCADE;

-- Presumably the source of tokenize(), which generate_bm25_vector() calls -- confirm.
CREATE EXTENSION IF NOT EXISTS pg_tokenizer CASCADE;

-- Presumably the source of the bm25vector type and of the bm25 access method /
-- bm25_ops operator class used on chunks.bm25_vector -- confirm.
CREATE EXTENSION IF NOT EXISTS vchord_bm25 CASCADE;

-- Table: documents (versioned documentation)
-- One row per (name, version) pair. pages and chunks point back to it through
-- their document_id foreign keys.
-- Constraint names are spelled out and match the names PostgreSQL would
-- generate by default, so the resulting catalog is unchanged.
CREATE TABLE IF NOT EXISTS documents (
    id          SERIAL,
    name        TEXT NOT NULL,
    version     TEXT NOT NULL,
    source_url  TEXT,
    description TEXT,
    metadata    JSONB DEFAULT CAST('{}' AS JSONB),
    created_at  TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    updated_at  TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, -- also bumped by the page/chunk insert triggers
    CONSTRAINT documents_pkey PRIMARY KEY (id),
    CONSTRAINT documents_name_version_key UNIQUE (name, version)
);

-- Table: pages (raw crawled content)
-- One row per crawled URL (url is unique across the whole table). Rows are
-- removed together with their parent document via ON DELETE CASCADE.
-- Constraint names are spelled out and match the names PostgreSQL would
-- generate by default, so the resulting catalog is unchanged.
CREATE TABLE IF NOT EXISTS pages (
    id             SERIAL,
    document_id    INTEGER NOT NULL,
    url            TEXT NOT NULL,
    content        TEXT NOT NULL,
    content_hash   TEXT NOT NULL, -- NOTE(review): hash algorithm is decided by the writer; not visible here
    content_length INTEGER GENERATED ALWAYS AS (length(content)) STORED, -- character count, maintained by the database
    crawled_at     TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    -- NOTE(review): status is nullable, so an explicit NULL passes the CHECK below.
    status         TEXT DEFAULT 'pending',
    group_id       UUID, -- For future grouping feature
    metadata       JSONB DEFAULT CAST('{}' AS JSONB),
    CONSTRAINT pages_pkey PRIMARY KEY (id),
    CONSTRAINT pages_document_id_fkey FOREIGN KEY (document_id)
        REFERENCES documents (id) ON DELETE CASCADE,
    CONSTRAINT pages_url_key UNIQUE (url),
    CONSTRAINT pages_status_check CHECK (
        status IN ('pending', 'processing', 'chunked', 'deleted')
    )
);

-- Table: chunks (embedded content)
-- One row per chunk of a document. embedding and bm25_vector are the columns
-- covered by the vector and bm25 indexes; rows are removed together with their
-- parent document via ON DELETE CASCADE.
CREATE TABLE IF NOT EXISTS chunks (
    id SERIAL PRIMARY KEY,
    document_id INTEGER NOT NULL REFERENCES documents (id) ON DELETE CASCADE,
    chunk_index INTEGER NOT NULL, -- presumably the chunk's ordinal within its document/group -- confirm
    content TEXT NOT NULL,
    group_id UUID, -- For future grouping feature
    embedding VECTOR (768), -- Dimension must match config
    bm25_vector bm25vector, -- Auto-generated by trigger
    created_at TIMESTAMPTZ DEFAULT NOW(),
    UNIQUE (
        document_id,
        group_id,
        chunk_index
    )
);

-- The UNIQUE constraint above treats NULLs as distinct, so it never rejects
-- duplicates among rows whose group_id is NULL (which is every row until the
-- grouping feature starts populating group_id). This partial unique index
-- closes that gap for ungrouped rows and works on every PostgreSQL version.
-- NOTE: creating it fails if the table already contains duplicate ungrouped
-- (document_id, chunk_index) rows; deduplicate those first. Upserts on
-- ungrouped rows must target
--   ON CONFLICT (document_id, chunk_index) WHERE group_id IS NULL
CREATE UNIQUE INDEX IF NOT EXISTS idx_chunks_document_chunk_ungrouped
ON chunks (document_id, chunk_index)
WHERE group_id IS NULL;

-- Indexes for performance
-- Plain lookup indexes; btree is PostgreSQL's default access method and is
-- spelled out here only for explicitness.

-- pages: by parent document, crawl status, content hash and group
CREATE INDEX IF NOT EXISTS idx_pages_document ON pages USING btree (document_id);
CREATE INDEX IF NOT EXISTS idx_pages_status ON pages USING btree (status);
CREATE INDEX IF NOT EXISTS idx_pages_hash ON pages USING btree (content_hash);
CREATE INDEX IF NOT EXISTS idx_pages_group ON pages USING btree (group_id);

-- chunks: by parent document and group
-- NOTE(review): the UNIQUE constraint on chunks leads with document_id, so its
-- backing index can also serve document_id lookups; idx_chunks_document may be
-- redundant -- confirm with EXPLAIN before dropping it.
CREATE INDEX IF NOT EXISTS idx_chunks_document ON chunks USING btree (document_id);
CREATE INDEX IF NOT EXISTS idx_chunks_group ON chunks USING btree (group_id);

-- Approximate nearest-neighbour index over chunks.embedding (cosine distance).
-- HNSW replaces IVFFlat here: IVFFlat derives its list centroids from the rows
-- present when the index is built, and on a fresh database this statement runs
-- while chunks is still empty, which leaves the index with poor recall. HNSW
-- has no training step, so it can be created up front.
-- Databases that already have an index named idx_chunks_vector keep it as is
-- (IF NOT EXISTS); drop and recreate it to switch an existing install.
CREATE INDEX IF NOT EXISTS idx_chunks_vector
ON chunks USING hnsw (embedding vector_cosine_ops);

-- BM25 index over chunks.bm25_vector, the column filled in by the
-- chunks_bm25_trigger trigger.
CREATE INDEX IF NOT EXISTS idx_chunks_bm25
ON chunks USING bm25 (bm25_vector bm25_ops);

-- Trigger: keep chunks.bm25_vector in step with chunks.content.
-- The function tokenizes the row's content with the tokenizer named 'bert' and
-- stores the result on the row being written, overwriting any value the
-- caller supplied for bm25_vector.
-- NOTE(review): no statement visible in this file registers a 'bert'
-- tokenizer -- confirm it is created elsewhere before the first insert.
CREATE OR REPLACE FUNCTION generate_bm25_vector()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $bm25$
BEGIN
    NEW.bm25_vector := tokenize(NEW.content, 'bert');
    RETURN NEW; -- BEFORE ROW trigger: the returned row is the one that gets stored
END;
$bm25$;

-- Drop-then-create keeps the script re-runnable.
DROP TRIGGER IF EXISTS chunks_bm25_trigger ON chunks;

-- Fires for every inserted row and for UPDATE statements that list content
-- among the columns they set.
CREATE TRIGGER chunks_bm25_trigger
    BEFORE INSERT OR UPDATE OF content
    ON chunks
    FOR EACH ROW
    EXECUTE FUNCTION generate_bm25_vector();

-- Trigger: Update documents.updated_at
-- Bumps updated_at on the document that owns a newly inserted page or chunk
-- (both tables carry document_id, so one function serves both triggers).
--
-- NOW() is the transaction start time, so every row inserted in the same
-- transaction would write the identical value. The IS DISTINCT FROM guard lets
-- the first row perform the update and turns the rest into no-ops, which
-- avoids one dead documents tuple per inserted page/chunk during bulk loads.
-- The comparison is NULL-safe because updated_at is nullable.
CREATE OR REPLACE FUNCTION update_document_timestamp()
RETURNS TRIGGER AS $$
BEGIN
    UPDATE documents
    SET updated_at = NOW()
    WHERE id = NEW.document_id
        AND updated_at IS DISTINCT FROM NOW();
    RETURN NEW; -- the return value is ignored for AFTER ROW triggers
END;
$$ LANGUAGE plpgsql;

-- Drop-then-create keeps the script re-runnable.
DROP TRIGGER IF EXISTS update_doc_on_page_insert ON pages;

CREATE TRIGGER update_doc_on_page_insert
AFTER INSERT ON pages
FOR EACH ROW
EXECUTE FUNCTION update_document_timestamp();

DROP TRIGGER IF EXISTS update_doc_on_chunk_insert ON chunks;

CREATE TRIGGER update_doc_on_chunk_insert
AFTER INSERT ON chunks
FOR EACH ROW
EXECUTE FUNCTION update_document_timestamp();