Skip to content

1 hidden cell
import pandas as pd

# read tab-limited data in csv file 
avocado = pd.read_csv('data/avocado.csv', sep='\t') 

# subset columns to keep only relevant columns 
subset_cols = ['code', 'lc', 'product_name_en', 'quantity', 'serving_size', 'packaging_tags', 'brands', 'brands_tags', 'categories_tags', 'labels_tags', 'countries', 'countries_tags', 'origins','origins_tags'] 
avocado = avocado[subset_cols] 

# Gather relevant categories data for avocados 
with open('data/relevant_avocado_categories.txt', 'r') as file: 
    relevant_avocado_categories = file.read().split() 
    # relevant_avocado_categories = file.read().splitlines()
    
print('avocado relevant categories: \n', relevant_avocado_categories)

## Filter data using relevant categories 

avocado.dropna(subset=['categories_tags'], inplace=True)
# transform comma separated string values in column into a list of tag words 
avocado.categories_tags = avocado.categories_tags.str.split(',') 

# Filter rows where categories_tags have any of the tags listed in relevant categories
avocado = avocado[avocado.categories_tags.apply(lambda x: bool(set(x) & set(relevant_avocado_categories)))] 
# avocado = avocado[avocado.categories_tags.apply(lambda x: any([i for i in x if i in relevant_avocado_categories]))]



## Where do most avocados come from? 

# Filter DataFrame for UK data 
avocado_uk = avocado.query("countries == 'United Kingdom'")

# Find most common country for avocado origin &  format to strip additional string characters  
top_avocado_origin = avocado_uk.origins_tags.value_counts().index[0].lstrip('en:').replace('-', ' ') 
print('avocado top origin country:', top_avocado_origin, '\n') 


# Define a function to do the above tasks on other ingredients & return its top origin country 
def read_and_filter_data(csvfile, txtfile): 
    df = pd.read_csv('data/' + csvfile, sep = '\t') 
    subset_cols = ['code', 'lc', 'product_name_en', 'quantity', 'serving_size', 'packaging_tags', 'brands', 'brands_tags', 'categories_tags', 'labels_tags', 'countries', 'countries_tags', 'origins','origins_tags'] 
    df = df[subset_cols] 
    
    with open('data/'+txtfile, mode='r') as file: 
        relevant_categories = file.read().split() 
    print(csvfile[:-4], 'relevant categories: \n', relevant_categories) 
    df = df.dropna(subset=['categories_tags']) 
    df.categories_tags = df.categories_tags.str.split(',') 
    df = df[df.categories_tags.apply(lambda x: bool(set(x) & set(relevant_categories)) ) ] 
    df_uk = df.query("countries == 'United Kingdom' ") 
    top_ingredient_origin = df_uk.origins_tags.value_counts().index[0].lstrip('en:').replace('-', ' ') 
    print(csvfile[:-4], 'top origin country:', top_ingredient_origin, '\n') 
    return top_ingredient_origin

# get top origin country for olive oil & sourdough using above function 
top_olive_oil_origin = read_and_filter_data('olive_oil.csv', 'relevant_olive_oil_categories.txt') 
top_sourdough_origin = read_and_filter_data('sourdough.csv', 'relevant_sourdough_categories.txt')