import os
import numpy as np
from time import time
from qiskit import QuantumCircuit, transpile
from qiskit.circuit.library import QFT
from qiskit_aer import AerSimulator
from qgear.toolbox.Util_CudaQ import qiskit_to_gateList
from qgear.runner import run_cudaq
from qgear.toolbox.Util_H5io4 import write4_data_hdf5, read4_data_hdf5
from qiskit.quantum_info import Statevector
from qiskit_aer import StatevectorSimulator
def benchmark_qft(n_qubits: int = 5, numShots: int = 1000, backend_cudaq: str = "nvidia", verb: int = 0):
    """Time sampling of one QFT circuit with CUDA-Q and with Qiskit Aer.

    Builds a decomposed QFT circuit (no final swaps), round-trips its gate
    list through an HDF5 file in ``out/``, samples it with ``run_cudaq`` and
    with ``AerSimulator``, and prints the statevector fidelity together with
    both elapsed times.

    Side effects: creates the ``out/`` directory, writes
    ``out/qft_<n_qubits>q.gate_list.h5`` and prints to stdout.

    Args:
        n_qubits: Number of qubits of the QFT circuit.
        numShots: Number of shots requested from both simulators.
        backend_cudaq: Passed as ``backend`` to ``run_cudaq``; also shown in
            the report.
        verb: Passed to ``read4_data_hdf5`` and ``run_cudaq``
            (presumably a verbosity level -- confirm against qgear).

    Returns:
        ``(ela_cudaq, ela_aer)``: elapsed seconds of the CUDA-Q path
        (``run_cudaq`` plus conversion of its counts) and of the Aer path
        (``run`` plus ``get_counts``).
    """
    # perf_counter is monotonic, so the measured intervals cannot be skewed
    # by system clock adjustments.
    from time import perf_counter

    # ----------------------------
    # 1. Create QFT circuit
    qc = QFT(num_qubits=n_qubits, do_swaps=False).decompose()
    # Reference statevector is taken before the measurements are appended.
    sv_qiskit = Statevector(qc)
    qc.measure_all()
    qcEL = [qc]

    # ----------------------------
    # 2. CUDA-Q path
    out_path = "out"
    os.makedirs(out_path, exist_ok=True)
    circ_name = f"qft_{n_qubits}q"
    gateD, md = qiskit_to_gateList(qcEL)
    md["short_name"] = circ_name

    # Round-trip the gate list through HDF5 so that run_cudaq consumes
    # exactly what was stored on disk.
    inpF = os.path.join(out_path, circ_name + ".gate_list.h5")
    write4_data_hdf5(gateD, inpF, md)
    gateD, _ = read4_data_hdf5(inpF, verb)

    T0 = perf_counter()
    resL_cudaq, stateL = run_cudaq(gateD, numShots, verb=verb, backend=backend_cudaq)
    sv_cudaq = stateL[0]
    # NOTE(review): counts_cudaq_to_qiskit is neither defined nor imported in
    # the visible part of this file; it has to be in scope when this runs.
    # The conversion is part of the timed CUDA-Q path.
    resL_cudaq = counts_cudaq_to_qiskit(resL_cudaq)
    ela_cudaq = perf_counter() - T0

    # Fidelity check: done outside both timed windows, because the array
    # copy and inner product over 2**n_qubits amplitudes would otherwise be
    # charged to one of the simulators.
    fidelity = abs(np.vdot(sv_qiskit, np.array(sv_cudaq))) ** 2

    # ----------------------------
    # 3. Qiskit Aer path (transpilation is not timed)
    aer_backend = AerSimulator()
    qcT = transpile(qc, aer_backend)
    T1 = perf_counter()
    job = aer_backend.run(qcT, shots=numShots)
    res_aer = job.result().get_counts()
    ela_aer = perf_counter() - T1

    print("Statevector fidelity:", fidelity)

    # ----------------------------
    # 4. Report
    print(f"\n=== QFT {n_qubits} qubits ===")
    print(f"CUDA-Q ({backend_cudaq}): {ela_cudaq:.4f} s")
    print(f"AerSimulator: {ela_aer:.4f} s")
    speedup = ela_aer / ela_cudaq if ela_cudaq > 0 else float('inf')
    if speedup > 1:
        print(f"Speedup (Aer / CUDA-Q): {speedup:.2f}x")
    else:
        # A zero Aer time gives speedup == 0; avoid dividing by it.
        slowdown = 1 / speedup if speedup > 0 else float('inf')
        print(f"CUDA-Q slower by {slowdown:.2f}x")
    return ela_cudaq, ela_aer
# ----------------------------
# Benchmark QFT circuits of growing width.
# Every entry of `results` is (n_qubits, cudaq_seconds, aer_seconds).
results = []
for nq in [3, 6, 9, 28]:
    timings = benchmark_qft(
        n_qubits=nq,
        numShots=2000,
        backend_cudaq="nvidia",
        verb=1,
    )
    results.append((nq,) + tuple(timings))
# ----------------------------
# Optional: plot the Aer / CUDA-Q time ratio against the circuit width
import matplotlib.pyplot as plt

# Split the (n_qubits, cudaq_seconds, aer_seconds) records into columns.
nq_list = [n for n, _, _ in results]
cudaq_times = [t_cudaq for _, t_cudaq, _ in results]
aer_times = [t_aer for _, _, t_aer in results]
speedups = [t_aer / t_cudaq for t_cudaq, t_aer in zip(cudaq_times, aer_times)]

plt.figure(figsize=(6, 4))
plt.plot(nq_list, speedups, marker="o")
plt.title("QFT Speedup: CUDA-Q vs Qiskit Aer")
plt.xlabel("Number of Qubits (QFT)")
plt.ylabel("Speedup (Aer / CUDA-Q)")
plt.grid(True)
plt.show()
qiskit_to_gateList: nGate 10
saving data as hdf5: out/qft_3q.gate_list.h5
h5-write : circ_type (1, 2) int32
h5-write : gate_type (1, 10, 3) int32
h5-write : gate_param (1, 10) float32
h5-write : meta.JSON as string (1,) object
closed hdf5: out/qft_3q.gate_list.h5 size=0.01 MB, elaT=0.0 sec
read data from hdf5: out/qft_3q.gate_list.h5
read obj: circ_type (1, 2) int32
read obj: gate_param (1, 10) float32
read obj: gate_type (1, 10, 3) int32
read str: meta.JSON 1 <class 'numpy.ndarray'>
done h5, num rec:3 elaT=0.0 sec
╭───╮
q0 : ─────────────────────────────●─────────────●──────┤ h ├
╭───╮ │ ╭─────┴─────╮╰───╯
q1 : ───────────●──────┤ h ├──────┼───────┤ r1(1.571) ├─────
╭───╮╭─────┴─────╮╰───╯╭─────┴──────╮╰───────────╯
q2 : ┤ h ├┤ r1(1.571) ├─────┤ r1(0.7854) ├──────────────────
╰───╯╰───────────╯ ╰────────────╯
Statevector fidelity: 0.9999999657714577
=== QFT 3 qubits ===
CUDA-Q (nvidia): 0.0033 s
AerSimulator: 0.1599 s
Speedup (Aer / CUDA-Q): 48.00x
qiskit_to_gateList: nGate 28
saving data as hdf5: out/qft_6q.gate_list.h5
h5-write : circ_type (1, 2) int32
h5-write : gate_type (1, 28, 3) int32
h5-write : gate_param (1, 28) float32
h5-write : meta.JSON as string (1,) object
closed hdf5: out/qft_6q.gate_list.h5 size=0.01 MB, elaT=0.0 sec
read data from hdf5: out/qft_6q.gate_list.h5
read obj: circ_type (1, 2) int32
read obj: gate_param (1, 28) float32
read obj: gate_type (1, 28, 3) int32
read str: meta.JSON 1 <class 'numpy.ndarray'>
done h5, num rec:3 elaT=0.0 sec
Statevector fidelity: 0.9999997615814347
=== QFT 6 qubits ===
CUDA-Q (nvidia): 0.0097 s
AerSimulator: 0.1722 s
Speedup (Aer / CUDA-Q): 17.69x
qiskit_to_gateList: nGate 55
saving data as hdf5: out/qft_9q.gate_list.h5
h5-write : circ_type (1, 2) int32
h5-write : gate_type (1, 55, 3) int32
h5-write : gate_param (1, 55) float32
h5-write : meta.JSON as string (1,) object
closed hdf5: out/qft_9q.gate_list.h5 size=0.01 MB, elaT=0.0 sec
read data from hdf5: out/qft_9q.gate_list.h5
read obj: circ_type (1, 2) int32
read obj: gate_param (1, 55) float32
read obj: gate_type (1, 55, 3) int32
read str: meta.JSON 1 <class 'numpy.ndarray'>
done h5, num rec:3 elaT=0.0 sec
Statevector fidelity: 0.9999997971840712
=== QFT 9 qubits ===
CUDA-Q (nvidia): 0.0061 s
AerSimulator: 0.1917 s
Speedup (Aer / CUDA-Q): 31.66x
qiskit_to_gateList: nGate 435
saving data as hdf5: out/qft_28q.gate_list.h5
h5-write : circ_type (1, 2) int32
h5-write : gate_type (1, 435, 3) int32
h5-write : gate_param (1, 435) float32
h5-write : meta.JSON as string (1,) object
closed hdf5: out/qft_28q.gate_list.h5 size=0.01 MB, elaT=0.1 sec
read data from hdf5: out/qft_28q.gate_list.h5
read obj: circ_type (1, 2) int32
read obj: gate_param (1, 435) float32
read obj: gate_type (1, 435, 3) int32
read str: meta.JSON 1 <class 'numpy.ndarray'>
done h5, num rec:3 elaT=0.0 sec
Statevector fidelity: 0.9999990463259107
=== QFT 28 qubits ===
CUDA-Q (nvidia): 0.3895 s
AerSimulator: 28.0898 s
Speedup (Aer / CUDA-Q): 72.12x