Skip to content

Install Great Expectations

  • Install the package, then print its version to confirm that it is correctly installed.
!pip install great_expectations
Hidden output
import great_expectations as gx

# Print the installed library version to confirm that the import works.
print(gx.__version__)

Generate sample DataFrames

  • Use the Lego Postgres database sample data and create two DataFrames: inventory_parts_df and colors_df.
Spinner
DataFrame as
inventory_parts_df
variable
-- Select every row and column of the inventory_parts table.
-- The notebook cell is configured to store the result as the
-- inventory_parts_df DataFrame, which the later Python cells validate.
SELECT *
FROM inventory_parts

Connect GX to the DataFrame data

  • Create a Data Source, a Data Asset and a Batch Definition that covers the whole source DataFrame
# Names under which the Data Source, the Data Asset and the Batch Definition
# are registered; later cells use the same names to look them up again.
data_source_name = "inventory_parts"
data_asset_name = "inventory_parts_asset"
batch_definition_name = "inventory_parts_batch"

# Entry point to Great Expectations for this notebook session
context = gx.get_context()

# Register a pandas Data Source and a DataFrame Data Asset on it
data_source = context.data_sources.add_pandas(name=data_source_name)
data_asset = data_source.add_dataframe_asset(name=data_asset_name)

# Fetch the Data Asset back through the context, by name
registered_source = context.data_sources.get(data_source_name)
data_asset = registered_source.get_asset(data_asset_name)

# Attach a Batch Definition that treats the entire DataFrame as one batch
batch_definition = data_asset.add_batch_definition_whole_dataframe(
    batch_definition_name
)

# Sanity check: the definition was stored under the requested name
assert batch_definition.name == batch_definition_name

Create Expectations

  • Retrieve the previously created Batch Definition and get a Batch from it, passing inventory_parts_df as the batch parameter so the Expectations run against it
  • Create a Suite and add two simple Expectations to it
# Walk from the context down to the Batch Definition registered earlier:
# Data Source -> Data Asset -> Batch Definition.
inventory_source = context.data_sources.get(data_source_name)
inventory_asset = inventory_source.get_asset(data_asset_name)
batch_definition = inventory_asset.get_batch_definition(batch_definition_name)

# The DataFrame to validate is supplied at runtime as a batch parameter
batch_parameters = {"dataframe": inventory_parts_df}
batch = batch_definition.get_batch(batch_parameters=batch_parameters)

# Start an empty Expectation Suite for the inventory_parts data
expectation_suite_name = "inventory_parts_suite"
suite = gx.ExpectationSuite(name=expectation_suite_name)

# Expectation 1: inventory_id must never be null
inventory_id_not_null = gx.expectations.ExpectColumnValuesToNotBeNull(
    column="inventory_id"
)
suite.add_expectation(inventory_id_not_null)

# Expectation 2: part_num values must be unique
part_num_unique = gx.expectations.ExpectColumnValuesToBeUnique(
    column="part_num"
)
suite.add_expectation(part_num_unique)

# Register the finished suite with the Data Context
context.suites.add(suite)

Validate the Expectations

  • Run the Suite of expectations against the source data in the inventory_parts_df DataFrame
# Run every Expectation in the suite against the Batch that was built from
# inventory_parts_df.
validation_results = batch.validate(suite)

# Print the full validation result object for inspection
print(validation_results)