Skip to content
1 hidden cell
Project: What's in an Avocado Toast: A Supply Chain Analysis
1 hidden cell
import pandas as pd
# read tab-limited data in csv file
avocado = pd.read_csv('data/avocado.csv', sep='\t')
# subset columns to keep only relevant columns
subset_cols = ['code', 'lc', 'product_name_en', 'quantity', 'serving_size', 'packaging_tags', 'brands', 'brands_tags', 'categories_tags', 'labels_tags', 'countries', 'countries_tags', 'origins','origins_tags']
avocado = avocado[subset_cols]
# Gather relevant categories data for avocados
with open('data/relevant_avocado_categories.txt', 'r') as file:
relevant_avocado_categories = file.read().split()
# relevant_avocado_categories = file.read().splitlines()
print('avocado relevant categories: \n', relevant_avocado_categories)
## Filter data using relevant categories
avocado.dropna(subset=['categories_tags'], inplace=True)
# transform comma separated string values in column into a list of tag words
avocado.categories_tags = avocado.categories_tags.str.split(',')
# Filter rows where categories_tags have any of the tags listed in relevant categories
avocado = avocado[avocado.categories_tags.apply(lambda x: bool(set(x) & set(relevant_avocado_categories)))]
# avocado = avocado[avocado.categories_tags.apply(lambda x: any([i for i in x if i in relevant_avocado_categories]))]
## Where do most avocados come from?
# Filter DataFrame for UK data
avocado_uk = avocado.query("countries == 'United Kingdom'")
# Find most common country for avocado origin & format to strip additional string characters
top_avocado_origin = avocado_uk.origins_tags.value_counts().index[0].lstrip('en:').replace('-', ' ')
print('avocado top origin country:', top_avocado_origin, '\n')
# Define a function to do the above tasks on other ingredients & return its top origin country
def read_and_filter_data(csvfile, txtfile):
df = pd.read_csv('data/' + csvfile, sep = '\t')
subset_cols = ['code', 'lc', 'product_name_en', 'quantity', 'serving_size', 'packaging_tags', 'brands', 'brands_tags', 'categories_tags', 'labels_tags', 'countries', 'countries_tags', 'origins','origins_tags']
df = df[subset_cols]
with open('data/'+txtfile, mode='r') as file:
relevant_categories = file.read().split()
print(csvfile[:-4], 'relevant categories: \n', relevant_categories)
df = df.dropna(subset=['categories_tags'])
df.categories_tags = df.categories_tags.str.split(',')
df = df[df.categories_tags.apply(lambda x: bool(set(x) & set(relevant_categories)) ) ]
df_uk = df.query("countries == 'United Kingdom' ")
top_ingredient_origin = df_uk.origins_tags.value_counts().index[0].lstrip('en:').replace('-', ' ')
print(csvfile[:-4], 'top origin country:', top_ingredient_origin, '\n')
return top_ingredient_origin
# get top origin country for olive oil & sourdough using above function
top_olive_oil_origin = read_and_filter_data('olive_oil.csv', 'relevant_olive_oil_categories.txt')
top_sourdough_origin = read_and_filter_data('sourdough.csv', 'relevant_sourdough_categories.txt')