# Quality Assurance <a href="https://colab.research.google.com/github/mostly-ai/mostlyai/blob/main/docs/tutorials/quality-assurance/quality-assurance.ipynb" target="_blank"><img src="https://img.shields.io/badge/Open%20in-Colab-blue?logo=google-colab" alt="Run on Colab"></a>

In this tutorial, we use `mostlyai-qa`, the open-source Python toolkit for assessing the quality of synthetic data. See the [mostlyai-qa documentation](https://mostly-ai.github.io/mostlyai-qa/) for more information on the toolkit.

In [None]:
# install the latest mostlyai-qa release; `-U` upgrades an already installed version
%pip install -U mostlyai-qa

In [None]:
import webbrowser
import pandas as pd
from mostlyai import qa

# initialize the toolkit's logging to stdout
# (presumably so that its progress messages appear in the cell output)
qa.init_logging()

# print the version of the imported toolkit, for the record of which release was used
print(f"loaded mostlyai-qa {qa.__version__}")

In [None]:
# base URL of the public repository that hosts the demo datasets
repo = "https://github.com/mostly-ai/paper-fidelity-accuracy/raw/refs/heads/main/2024-12/data"

# fetch the three gzipped CSV files: training, holdout and synthetic data
trn = pd.read_csv(f"{repo}/adult_trn.csv.gz")
hol = pd.read_csv(f"{repo}/adult_hol.csv.gz")
syn = pd.read_csv(f"{repo}/adult_mostlyai.csv.gz")

# summarize the number of rows and columns of each fetched dataset
print(f"fetched training data with {len(trn):,} records and {len(trn.columns)} attributes")
print(f"fetched holdout data with {len(hol):,} records and {len(hol.columns)} attributes")
print(f"fetched synthetic data with {len(syn):,} records and {len(syn.columns)} attributes")

In [None]:
# preview three randomly drawn training records; the fixed seed keeps the
# displayed rows identical across "Restart Kernel -> Run All" executions
trn.sample(n=3, random_state=42)

In [None]:
# preview three randomly drawn synthetic records; the fixed seed keeps the
# displayed rows identical across "Restart Kernel -> Run All" executions
syn.sample(n=3, random_state=42)

## Generate HTML Report with Metrics

In [None]:
# generate the HTML report and the metrics from the synthetic, training and
# holdout data; takes about 1-2 minutes
report_path, metrics = qa.report(
    syn_tgt_data=syn,
    trn_tgt_data=trn,
    hol_tgt_data=hol,
    max_sample_size_embeddings=1_000,  # set limit to speed up demo; remove limit for best measures
)

# pretty print metrics
print(metrics.model_dump_json(indent=4))

# open up HTML report in new browser window; the URL is built via `as_uri()`
# so that it is a well-formed, percent-encoded `file:///...` URI on every
# platform (plain "file://" + path concatenation yields a malformed URL for
# Windows paths and for paths containing spaces or other special characters)
# NOTE(review): the browser is launched on the machine that runs the kernel, so
# this likely has no visible effect on hosted runtimes such as Colab - confirm
webbrowser.open(report_path.absolute().as_uri())