# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/runner.ipynb.

# %% auto 0
__all__ = ['prepare_runner_args', 'run_qiskit_aer', 'run_cudaqft', 'input_shard', 'run_gate_job', 'rank_print',
           'canned_qcrank_inp', 'make_qcrank', 'harvest_cudaq_backRun_submitMeta', 'run_cudaq', 'run_qcrank']

# %% ../nbs/runner.ipynb 1
import os, re, random, psutil, cudaq
from time import time, localtime
from pprint import pprint
from qiskit_aer import AerSimulator
import os
import numpy as np
from pprint import pprint
from qiskit import transpile
import hashlib
from .toolbox.Util_ibm import harvest_circ_transpMeta
from .toolbox.Util_H5io4 import write4_data_hdf5, read4_data_hdf5
from .toolbox.Util_IOfunc import dateT2Str, write_yaml
from .toolbox.Util_CudaQ import circ_kernel, counts_cudaq_to_qiskit, qiskit_to_gateList, qft_kernel
from .toolbox.Util_Qiskit import pack_counts_to_numpy, circ_depth_aziz
from .datacircuits import qcrank
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"

# %% ../nbs/runner.ipynb 2
def prepare_runner_args(args):
    """Finalize the runner argument namespace in place and return it.

    Steps performed:
      * ``args.basePath == 'env'`` is replaced by ``$Cudaq_dataVault``
        (``KeyError`` if that variable is not set).
      * A non-empty ``args.basePath`` defines ``args.inpPath`` (``<base>/circ``)
        and ``args.outPath`` (``<base>/meas``); otherwise both attributes must
        already be present on ``args``.
      * ``args.myRank`` / ``args.numRank`` are fixed to ``0`` / ``1``.
      * Rank 0 prints every argument and creates ``args.outPath``.

    Raises ``AssertionError`` when ``args.inpPath`` does not exist.
    """
    base = args.basePath
    if base == 'env':
        base = os.environ['Cudaq_dataVault']
        args.basePath = base
    if base:
        args.inpPath = os.path.join(base, 'circ')
        args.outPath = os.path.join(base, 'meas')

    # Single-process setup: the rank is always 0 of 1.
    args.myRank = 0
    args.numRank = 1

    if args.myRank != 0:
        # Non-zero ranks stay quiet (not reachable with the fixed rank above).
        args.verb = 0
    else:
        for name in vars(args):
            print('myArg:', name, getattr(args, name))
        os.makedirs(args.outPath, exist_ok=True)

    assert os.path.exists(args.inpPath)
    return args

# %% ../nbs/runner.ipynb 3
def run_qiskit_aer(qcL, shots):
    """Execute `qcL` on a default-constructed `AerSimulator` and return the counts.

    Args:
        qcL: circuit(s) handed unchanged to `AerSimulator.run`.
        shots: forwarded as the `shots` keyword of `AerSimulator.run`.

    Returns:
        Whatever `get_counts()` of the finished job's result yields.
    """
    simulator = AerSimulator()
    # `result()` is called on the submitted job before the counts are read.
    return simulator.run(qcL, shots=shots).result().get_counts()

# %% ../nbs/runner.ipynb 4
def run_cudaqft(shots, num_gpus, num_qubit, nc=1, target="nvidia", verb=1):
    """Sample the CUDA-Q QFT kernel on `nc` random 0/1 input lists.

    Args:
        shots: forwarded as `shots_count` to `cudaq.sample`.
        num_gpus: accepted for interface compatibility; not used here.
        num_qubit: length of the random 0/1 list passed to `qft_kernel`.
        nc: number of independent samples to take; `0` yields an empty list.
        target: label echoed in the log line and in the return value.
            NOTE(review): this function never calls `cudaq.set_target`, so the
            label does not select the simulator — confirm callers set it.
        verb: accepted for interface compatibility; not used here.

    Returns:
        Tuple `(resL, target)` where `resL` holds one `cudaq.sample` result per
        sampled circuit.
    """
    resL = []
    for _ in range(nc):
        input_state = [random.choice([0, 1]) for _q in range(num_qubit)]
        resL.append(cudaq.sample(qft_kernel, input_state, shots_count=shots))

    # With nc == 0 there is no first result to measure; report 0 instead of
    # failing with IndexError on resL[0].
    num_strings = len(resL[0]) if resL else 0
    print('RCQ: done', num_strings, target)
    return resL, target

# %% ../nbs/runner.ipynb 5
def input_shard(bigD, myRank, numRank, verb=1):
    """Cut every entry of `bigD` down to the slice owned by `myRank`.

    The number of samples is taken from the leading dimension of
    `bigD['circ_type']` and must divide evenly by `numRank` (checked with an
    `assert`). Each value in `bigD` is replaced, in place, by its
    `[myRank*shardSize : (myRank+1)*shardSize]` slice.

    Args:
        bigD: dict of sliceable arrays; mutated in place.
        myRank: index of the shard to keep.
        numRank: total number of shards.
        verb: progress lines are printed when greater than 0.

    Returns:
        The shard size (samples per rank).
    """
    chatty = verb > 0
    if chatty:
        print(f'Shard for rank={myRank} of {numRank}')

    n_total = bigD['circ_type'].shape[0]
    shardSize, leftover = divmod(n_total, numRank)
    assert leftover == 0

    if chatty:
        print(f'select {myRank}-shard of size {shardSize}')

    start = myRank * shardSize
    window = slice(start, start + shardSize)
    # Only existing keys are reassigned, so iterating the dict here is safe.
    for key, column in bigD.items():
        bigD[key] = column[window]
    return shardSize

# %% ../nbs/runner.ipynb 6
def run_gate_job(
    exp: str,
    backend: str = "nvidia",
    numshots: int = 1024,
    basePath: str = None,
    qft: bool = False,
    target_option: str = "fp32",
    verbosity: int = 1,
    num_qubit: int = None
):
    """
    Run a gate list experiment and write its metadata to a YAML file.

    Args:
        exp: Experiment name (without `.gate_list.h5`)
        backend: Backend to use ('nvidia', 'qiskit-cpu', 'tensornet', 'qpp-cpu')
        numshots: Shots per circuit
        basePath: Base directory holding `circ/` (input) and `meas/` (output).
            None, '' or 'env' selects $Cudaq_dataVault, falling back to the
            current working directory when that variable is unset or empty.
        qft: If True, run QFT kernel instead of gate list
        target_option: Target options (default 'fp32')
        verbosity: Verbosity level (0-3)
        num_qubit: Number of qubits for QFT mode. Needed when `qft` is True on
            a non-qiskit backend, because no gate-list metadata is loaded in
            that mode. Ignored when `qft` is False.

    Returns:
        The metadata dict that was written to
        `<basePath>/meas/<short_name>_<target2>_<target_option>.yaml`.

    Raises:
        ValueError: `qft` is True on a CUDA-Q backend and `num_qubit` is None.
    """
    # Fail before touching the filesystem: the QFT branch has no other source
    # for the register width.
    if qft and 'qiskit' not in backend and num_qubit is None:
        raise ValueError("run_gate_job: qft=True requires num_qubit")

    # === Resolve paths ===
    # An explicit basePath wins; otherwise use the environment, then the cwd.
    if not basePath or basePath == 'env':
        basePath = os.environ.get('Cudaq_dataVault') or os.getcwd()
    inpPath = os.path.join(basePath, 'circ')
    outPath = os.path.join(basePath, 'meas')
    os.makedirs(outPath, exist_ok=True)

    # === MPI rank handling ===
    myRank, numRank = 0, 1

    # === Load input circuit ===
    if not qft:
        inpF = f"{exp}.gate_list.h5"
        gateD, MD = read4_data_hdf5(os.path.join(inpPath, inpF), verbosity)
    else:
        gateD, MD = {}, {'short_name': exp}
        if num_qubit is not None:
            MD['num_qubit'] = int(num_qubit)

    if backend == 'qiskit-cpu':
        shardSize = gateD['circ_type'].shape[0] // numRank
        iOff = myRank * shardSize
        for xx in gateD:
            gateD[xx] = gateD[xx][iOff:iOff+shardSize]
        MD['num_circ'] = shardSize
        MD['my_rank'] = myRank
        MD['num_rank'] = numRank
        # NOTE(review): no launcher-provided core/task counts are visible in
        # this function, so host values are recorded — confirm this matches
        # what downstream consumers of these keys expect.
        MD['cores'] = os.cpu_count()
        MD['tasks_per_node'] = numRank

    nCirc = MD.get('num_circ', 1)

    # === Backend execution ===
    T0 = time()
    if 'qiskit' in backend:
        # NOTE(review): qiskit_circ_gateList and get_cpu_info are not imported
        # in the visible part of this module — confirm where they come from.
        qcL = qiskit_circ_gateList(gateD, MD)
        backend_sim = AerSimulator()
        job = backend_sim.run(qcL, shots=numshots)
        resL = job.result().get_counts()
        MD['cpu_info'] = get_cpu_info(verb=0)
        target2 = 'par-cpu'

    elif qft:
        # NOTE(review): cudaq.set_target is not called on this path, so the
        # kernel runs on whatever target is currently active — confirm.
        num_gpus = cudaq.num_available_gpus()
        resL = []
        for _ in range(nCirc):
            input_state = [random.choice([0, 1]) for _q in range(MD['num_qubit'])]
            results = cudaq.sample(qft_kernel, input_state, shots_count=numshots)
            resL.append(results)
        MD['num_gpus'] = num_gpus
        target2 = 'adj-gpu'

    else:
        cudaq.set_target(backend, option=target_option)
        num_qpus = cudaq.get_target().num_qpus()
        resL = []
        for i in range(nCirc):
            n_qubit, n_gate = map(int, gateD['circ_type'][i])
            gate_type = list(map(int, gateD['gate_type'][i].flatten()))
            gate_param = list(map(float, gateD['gate_param'][i]))
            results = cudaq.sample(circ_kernel, n_qubit, n_gate, gate_type, gate_param, shots_count=numshots)
            resL.append(results)
        MD['num_qpus'] = num_qpus
        target2 = 'adj-gpu'
    elaT = time() - T0

    # === Metadata update ===
    MD.update({
        # Wall-clock duration of the backend execution above, in seconds.
        'elapsed_time': elaT,
        'target': backend,
        'date': dateT2Str(),
        'num_meas_strings': [len(x) for x in resL],
        'target2': target2,
        'num_shots': numshots
    })

    # === Save output ===
    outF = os.path.join(outPath, f"{MD['short_name']}_{target2}_{target_option}.yaml")
    write_yaml(MD, outF)

    if verbosity:
        print(f"M:done {MD['short_name']} elaT={MD['elapsed_time']:.1f} sec")
        pprint(MD)

    return MD

# %% ../nbs/runner.ipynb 7
"""
QCrank GPU Simulator Runner (nbdev version, no MPI)

Runs QCrank simulations locally with CUDA-Q.
Input is a serialized gate list from Util_CudaQ: qiskit_to_gateList().
"""

# ----------------------------
def rank_print(*args, **kwargs):
    """Print unconditionally; a stand-in for a rank-aware printer.

    All positional and keyword arguments are forwarded to the builtin
    `print` unchanged. No rank check is performed.
    """
    # Pure pass-through: the caller controls sep/end/file/flush as with print().
    print(*args, **kwargs)

# ----------------------------
def canned_qcrank_inp(inp_path: str, circ_name: str, num_shot_per_addr: int):
    """Read `<inp_path>/<circ_name>.qcrank_inp.h5` and attach the shot budget.

    The total shot count is `num_shot_per_addr * md['payload']['seq_len']` and
    is stored as `md['submit'] = {'num_shots': ...}`, replacing any existing
    `'submit'` entry.

    Returns:
        Tuple `(bigD, md)` as produced by `read4_data_hdf5`, with `md` updated.
    """
    h5_path = os.path.join(inp_path, circ_name + '.qcrank_inp.h5')
    bigD, md = read4_data_hdf5(h5_path)

    total_shots = num_shot_per_addr * md['payload']['seq_len']
    md['submit'] = {'num_shots': total_shots}
    return bigD, md

# ----------------------------
def make_qcrank(md, barrier=True):
    """Build a `qcrank.ParametrizedQCRANK` from the payload section of `md`.

    Args:
        md: metadata dict; `md['payload']['nq_addr']` and
            `md['payload']['nq_fdata']` supply the two qubit counts.
        barrier: forwarded as the `barrier` keyword of the constructor.

    Returns:
        The constructed `ParametrizedQCRANK` instance.
    """
    payload = md['payload']
    n_addr, n_data = payload['nq_addr'], payload['nq_fdata']

    # Fixed construction options; only `barrier` is caller-controlled.
    options = dict(
        keep_last_cx=True,
        barrier=barrier,
        measure=True,
        statevec=False,
        reverse_bits=True,
    )
    return qcrank.ParametrizedQCRANK(
        n_addr, n_data, qcrank.QKAtan2DecoderQCRANK, **options
    )

# ----------------------------
def harvest_cudaq_backRun_submitMeta(md, backend: str, exp_name: str = None):
    """Stamp `md` in place with submission details for a CUDA-Q run.

    Writes `backend`, `date` and `unix_time` into the existing `md['submit']`
    dict, stores a fresh 6-hex-character random tag in `md['hash']`, and sets
    `md['short_name']` to `exp_name` when given, else to `'cudaq_' + hash`.

    Returns:
        None; `md` is modified in place.
    """
    submit = md['submit']
    submit['backend'] = backend
    submit['date'] = dateT2Str(localtime())
    submit['unix_time'] = int(time())

    # Random tag: first 6 hex digits of the MD5 of 32 OS-random bytes.
    job_tag = hashlib.md5(os.urandom(32)).hexdigest()[:6]
    md['hash'] = job_tag

    if exp_name is None:
        md['short_name'] = 'cudaq_' + md['hash']
    else:
        md['short_name'] = exp_name

# ----------------------------
def run_cudaq(gateD, shots, verb=1, backend="qpp-cpu"):
    """Sample and fetch the state of every circuit stored in `gateD`.

    The CUDA-Q target is set to `backend` first. For each row `i`,
    `gateD['circ_type'][i]` gives `(num_qubit, num_gate)` and the matching rows
    of `gateD['gate_type']` / `gateD['gate_param']` are converted to plain
    Python lists before being passed to `circ_kernel`.

    Args:
        gateD: dict with 'circ_type', 'gate_type' and 'gate_param' entries.
        shots: forwarded as `shots_count` to `cudaq.sample`.
        verb: a circuit drawing is printed for the first circuit when it has
            fewer than 6 qubits, or for every circuit when `verb > 1`.
        backend: name passed to `cudaq.set_target`.

    Returns:
        Tuple `(resL, stateL)`: per-circuit `cudaq.sample` results and
        per-circuit `cudaq.get_state` results.
    """
    cudaq.set_target(backend)

    resL, stateL = [], []
    for idx in range(len(gateD['circ_type'])):
        num_qubit, num_gate = (int(v) for v in gateD['circ_type'][idx])
        gate_type = [int(g) for g in gateD['gate_type'][idx].flatten()]
        gate_param = [float(p) for p in gateD['gate_param'][idx]]
        assert num_gate <= len(gate_param)

        kernel_args = (circ_kernel, num_qubit, num_gate, gate_type, gate_param)

        show_drawing = verb > 1 or (idx == 0 and num_qubit < 6)
        if show_drawing:
            print(cudaq.draw(*kernel_args))

        resL.append(cudaq.sample(*kernel_args, shots_count=shots))
        stateL.append(cudaq.get_state(*kernel_args))
    return resL, stateL

# ----------------------------
def run_qcrank(
    circ_name: str,
    inp_path: str = "out",
    out_path: str = "out",
    backend: str = "nvidia",
    num_shot_per_addr: int = 400,
    exp_name: str = None,
    verb: int = 1
):
    """
    Run a QCrank simulation with CUDA-Q.

    Pipeline: load `<inp_path>/<circ_name>.qcrank_inp.h5`, build and transpile
    the parametrized circuit, bind the input data, serialize the circuits to
    `<out_path>/<circ_name>.gate_list.h5`, sample them with CUDA-Q on
    `backend`, and write counts plus metadata to
    `<out_path>/<short_name>.h5`.

    Args:
        circ_name: Circuit name without extension.
        inp_path: Path to input .qcrank_inp.h5 file.
        out_path: Directory for outputs.
        backend: CUDA-Q backend target.
        num_shot_per_addr: Shots per address.
        exp_name: Optional experiment name to override auto-generated job ID.
        verb: Verbosity level.

    Returns:
        Tuple `(expMD, expD)`: the updated metadata dict and data dict that
        were written to the output file.
    """
    os.makedirs(out_path, exist_ok=True)

    # Load input
    expD, expMD = canned_qcrank_inp(inp_path, circ_name, num_shot_per_addr)
    if verb:
        pprint(expMD)

    numShots = expMD['submit']['num_shots']
    cudaq.set_target(backend)

    if verb:
        rank_print(f"M: using backend={backend}, total shots={numShots}")

    # Build circuit
    qcrankObj = make_qcrank(expMD)
    qcP = qcrankObj.circuit
    nqTot = qcP.num_qubits
    rank_print(f"M: circuit has {qcP.num_qubits} qubits")
    circ_depth_aziz(qcP, text='circ_orig')

    backend_aer = AerSimulator()
    qcT = transpile(qcP, backend_aer, basis_gates=['cx', 'ry', 'h'])

    if qcP.num_qubits < 6 and verb:
        rank_print("M: PARAMETRIZED TRANSPILED CIRCUIT:")
        rank_print(qcT.draw(output='text', idle_wires=False))

    harvest_circ_transpMeta(qcT, expMD, backend)

    # Bind data
    f_data = expD['inp_fdata']
    qcrankObj.bind_data(f_data, max_val=expMD['payload']['qcrank_max_fval'])

    # Instantiate circuits
    qcEL = qcrankObj.instantiate_circuits()
    nCirc = len(qcEL)
    rank_print(f"M: execution-ready {nCirc} circuits on {nqTot} qubits on {backend}")

    # Convert to gate list; it is written to disk and read back before running
    outD, md = qiskit_to_gateList(qcEL)
    inpF = os.path.join(out_path, circ_name + '.gate_list.h5')
    md['short_name'] = circ_name
    write4_data_hdf5(outD, inpF, md)
    gateD, MD = read4_data_hdf5(inpF, verb)

    if verb:
        print(f"M: job {circ_name} started, nCirc={nCirc}, nq={MD['num_qubit']}, shots/circ={num_shot_per_addr}, target={backend}")

    # Run CUDA-Q
    # `backend` must be forwarded: run_cudaq sets the CUDA-Q target itself and
    # would otherwise fall back to its own default, overriding the target
    # selected above while the metadata still reports `backend`.
    T0 = time()
    resL, _stateL = run_cudaq(gateD, numShots, verb=verb, backend=backend)
    elaT = time() - T0
    rank_print(f"RCQ: done {len(resL[0])} {backend}, elapsed {elaT:.2f}s")

    harvest_cudaq_backRun_submitMeta(expMD, backend, exp_name)

    # Convert results
    countsL = counts_cudaq_to_qiskit(resL)
    pp0 = countsL[0]

    # NOTE(review): 'device' is recorded as 'GPU' regardless of `backend` —
    # confirm whether CPU targets should be labelled differently.
    qa = {
        'status': 'JobStatus.DONE',
        'num_circ': nCirc,
        # Bitstring length of one measured key gives the classical-bit count.
        'num_clbits': len(next(iter(pp0.keys()))),
        'device': 'GPU',
        'method': 'statevector',
        'noise': 'ideal',
        'shots': numShots,
        'time_taken': elaT
    }
    if verb:
        rank_print(f"Job QA: {qa}")

    expMD['job_qa'] = qa
    pack_counts_to_numpy(expMD, expD, countsL)

    # Save results
    outF = os.path.join(out_path, expMD['short_name'] + '.h5')
    write4_data_hdf5(expD, outF, expMD)

    return expMD, expD
