Skip to content

Raspberry Yield

Raspberry Yield Analysis is a real-life project using real-life data from a farm I lived on, on Vancouver Island. The 2025 season was weak in fruit production compared to the 2021-2024 seasons, which made me question how different factors affect fruit production.

My assumptions are:

  1. Temperature is a key factor and has a positive correlation with production.
  2. There is a certain window of minimum and maximum temperatures which is optimal.
  3. Wind has a negative correlation with production.
  4. There is a delayed effect of 24 hours on production for any weather factor; in other words, today's production is correlated with yesterday's weather.

The data originates from my logs recording the daily production of the end product. I was the only person sealing the frozen bags, which meant I could track the exact production numbers for each day of the harvesting season. The end product is 1.8 kg of frozen raspberries packed into a large zipper bag and placed on an electronic scale to ensure it weighs the magical 1.8 kg, plus or minus 10 grams. I used this as my original unit of measurement for production because of the accuracy and integrity of this metric. Production includes sorting, freezing and bagging, and takes 24 hours to complete on average. In other words, today's product was picked in the field yesterday, so 4 bags recorded as today's production translate to 4 bags * 1.8 kg = 7.2 kg of fruit picked. The weather data was collected from the online records of the closest weather station, as published on wunderground.com.

import pandas as pd

# Load the Excel file
rasp25 = pd.read_excel('raspberry2025.xlsx')

# Fill missing values in rasp25 with zeros
# NOTE(review): this is applied to every column, so a missing weather reading
# would also become 0 -- confirm that only production counts can be blank.
rasp25 = rasp25.fillna(0)

# Rename columns to Python-style snake_case names.
# The keys are the spreadsheet headers exactly as used by the original code,
# including their misspellings and trailing spaces; rename() silently ignores
# keys that do not match, so they must not be "corrected" here.
rasp25.rename(
    columns={
        'Bags Picked Today ': 'kgs_picked_today',
        'Max Tepmrature  ': 'max_temp',
        'Average Day Temp ': 'avg_temp',
        'Min Temprature': 'min_temp',
        'Max Wind Speed': 'max_wind',
        'Date': 'date',
        'Frozen Bags Produced': 'frozen_bags_produced',
        'Total Bags Produced': 'total_bags_produced',
        'U-Pick & Self Pick Buckets': 'upick_farmerpick_buckets',
        'Rain Previous 24h': 'rain_24h',
        'Sun Exposure': 'sun',
        'Bags Picked Yesterday': 'kgs_picked_yesterday',
    },
    inplace=True,
)

# Convert bag counts to kilograms: 1 bag => 1.8 kgs
kg_per_bag = 1.8
rasp25['kgs_picked_today'] = rasp25['kgs_picked_today'] * kg_per_bag
# The "yesterday" column is renamed to a kgs_ name as well, so give it the same
# unit conversion; it is only converted when the spreadsheet provides it.
if 'kgs_picked_yesterday' in rasp25.columns:
    rasp25['kgs_picked_yesterday'] = rasp25['kgs_picked_yesterday'] * kg_per_bag

# Print the column names to verify (a bare expression shows nothing when this
# runs as a script rather than as the last line of a notebook cell)
print(rasp25.columns)
# Bar chart: number of frozen bags produced on each day of the season
import matplotlib.pyplot as plt 
import seaborn as sns

bags_ax = sns.barplot(
    data=rasp25,
    x='date',
    y='frozen_bags_produced',
    hue='frozen_bags_produced',
)
bags_ax.set_title('Frozen Bags Produced Daily')
plt.xticks(rotation=90)

# Replace the default date tick labels with a compact "Mon DD" form
shortform_dates = rasp25['date'].dt.strftime('%b %d')
bags_ax.set_xticklabels(shortform_dates, size=8)
plt.show()
# Bar chart: kilograms of fruit harvested on each day of the season
import matplotlib.pyplot as plt 
import seaborn as sns

harvest_ax = sns.barplot(
    data=rasp25,
    x='date',
    y='kgs_picked_today',
    hue='kgs_picked_today',
)
harvest_ax.set_title('Daily Harvest kgs')
plt.xticks(rotation=90)

# Replace the default date tick labels with a compact "Mon DD" form
shortform_dates = rasp25['date'].dt.strftime('%b %d')
harvest_ax.set_xticklabels(shortform_dates, size=8)
plt.show()
# Visualize correlation between production and min temp
# Keep only picking days, so that days on which no fruit picking occurred do
# not add zero-yield data points.
# .copy() makes this an independent DataFrame: lag columns are added to it
# further down, and assigning into a filtered slice of rasp25 makes pandas
# emit SettingWithCopyWarning.
rasp25picking = rasp25[rasp25['kgs_picked_today'] > 0].copy()

# Temperature goes on x (explanatory) and yield on y (response), the same
# orientation used by the lagged-temperature plots later in the file.
sns.lmplot(data=rasp25picking, x='min_temp', y='kgs_picked_today')
plt.show()

# Compute correlation between the two columns
corr = rasp25picking[['kgs_picked_today', 'min_temp']].corr().loc['kgs_picked_today', 'min_temp']
print("Correlation:", corr)
# Add lagged copies of each weather column to the picking-days frame: the
# value recorded 1 to 5 days before each picking day.
cols = ['max_temp', 'min_temp',
        'avg_temp', 'max_wind']
lag_suffixes = {
    1: 'Yesterday',
    2: '2 Days Ago',
    3: '3 Days Ago',
    4: '4 Days Ago',
    5: '5 Days Ago',
}

# Work on an independent copy so the column assignments below do not write
# into a slice of rasp25 (which triggers SettingWithCopyWarning).
rasp25picking = rasp25picking.copy()

# Lags are taken from the full daily log, not from the picking-days subset:
# shifting the subset would return the previous *picking* day, which is not
# "yesterday" whenever non-picking days lie in between.
# NOTE(review): assumes rasp25 holds exactly one row per calendar day -- confirm.
daily = rasp25.sort_values('date')

for col in cols:
    for days_back, suffix in lag_suffixes.items():
        # Assignment aligns on the row index, so each picking day receives the
        # value from `days_back` rows (days) earlier in the daily log.
        rasp25picking[f'{col} {suffix}'] = daily[col].shift(days_back)

# Show the resulting column names to verify
print(rasp25picking.columns)
# Plot yield against the minimum temperature of the same day, yesterday, and
# 2, 3, 4 and 5 days ago, then compare the six correlations in one bar chart.
times = ['', ' Yesterday', ' 2 Days Ago', ' 3 Days Ago', ' 4 Days Ago', ' 5 Days Ago']  

sns.set_palette("Blues_r")
# Maps each min_temp column name to its (rounded) correlation with yield
corrs = {}

# Loop through the times list and compute the correlation for each time frame
for time in times:
    col_name = f'min_temp{time}'
    if col_name not in rasp25picking.columns:
        print(f"Column '{col_name}' not found in DataFrame. Skipping.")
        continue
    sns.lmplot(data=rasp25picking, x=col_name, y='kgs_picked_today')
    plt.title(f'Yield Vs {col_name}')
    plt.show()
    corr = rasp25picking[['kgs_picked_today', col_name]].corr().loc['kgs_picked_today', col_name]
    corr = corr.round(2)
    print(f"Correlation: Kgs Picked Today Vs {col_name}", corr)
    corrs[col_name] = corr

# Convert the dictionary to a DataFrame for display
import pandas as pd
corrs_df = pd.DataFrame.from_dict(corrs, orient='index', columns=['Correlation'])
corrs_df = corrs_df.reset_index().rename(columns={'index': 'min_temp'})

# Sequential red palette with one shade per distinct correlation value.
# The hue levels are the distinct (rounded) correlations, which can be fewer
# than six when two time frames round to the same value, so the palette is
# sized from the data instead of being hard-coded.
n_shades = max(corrs_df['Correlation'].nunique(), 1)
seq_pal = sns.color_palette("Reds", n_shades)
sns.barplot(data=corrs_df, y='min_temp', palette=seq_pal, x='Correlation', hue='Correlation')
plt.ylabel('')
plt.title('Comparing Correlations of Yield To Minimum Temperatures')
plt.show()
# Plot Yield Vs Min Temp 2 days ago: one bar per picking day, coloured by the
# minimum temperature recorded two days earlier.
# A named sequential palette is passed so seaborn sizes the colour range to
# the hue values itself; a fixed list of 6 colours does not match the number
# of distinct temperatures.
sns.barplot(data=rasp25picking, x='date', y='kgs_picked_today', palette="Reds", hue='min_temp 2 Days Ago', width=2)
# Format the x-tick labels.
# The labels must come from the frame that was plotted (picking days only);
# taking them from the full rasp25 log yields a different number of labels
# than bars and attaches the wrong dates.
shortform_dates = rasp25picking['date'].dt.strftime('%b %d')
plt.gca().set_xticklabels(shortform_dates, size=8)
# Rotate x axis labels
plt.xticks(rotation=90)  
plt.show()
# Define a function yield_corr() which takes one argument stat from stats list and computes correlations to yield in all 6 time frames in times list   

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Column-name suffixes for the six time frames (same day, then 1-5 days back)
times = ['', ' Yesterday', ' 2 Days Ago', ' 3 Days Ago', ' 4 Days Ago', ' 5 Days Ago']
# Weather statistics that yield_corr() accepts
stats = ['min_temp', 'avg_temp', 'max_temp', 'max_wind']

def yield_corr(stat):
    """Print and chart how daily yield correlates with one weather statistic.

    For every suffix in the module-level ``times`` list, the column
    ``f'{stat}{suffix}'`` of the module-level ``rasp25picking`` DataFrame is
    correlated with ``'kgs_picked_today'``. Each correlation is rounded to two
    decimals and printed, and all of them are then drawn as one horizontal
    bar chart.

    Args:
        stat: Name of the statistic to analyse; must be one of the entries of
            the module-level ``stats`` list.

    Returns:
        None. If ``stat`` is not in ``stats`` the valid names are printed and
        nothing is plotted; columns missing from ``rasp25picking`` are
        reported and skipped.
    """
    # Reject unknown statistics by listing the valid ones (no exception is
    # raised). This runs before the plotting imports so that a bad argument
    # is reported without loading them.
    if stat not in stats:
        print("These are the possible statistics (arguments) the function takes:")
        for valid_stat in stats:
            print(valid_stat)
        return

    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt

    # Maps each column name to its rounded correlation with yield
    corrs = {}
    for time in times:
        col_name = f'{stat}{time}'
        if col_name not in rasp25picking.columns:
            print(f"Column '{col_name}' not found in DataFrame. Skipping.")
            continue

        # Compute correlations
        corr = rasp25picking[['kgs_picked_today', col_name]].corr().loc['kgs_picked_today', col_name]
        corr = corr.round(2)
        print(f"Correlation: Kgs Picked Today & {col_name}", corr)

        # Store the result in the corrs dictionary
        corrs[col_name] = corr

    # Nothing was computed (every column was missing): there is nothing to plot
    if not corrs:
        print(f"No columns found for '{stat}'. Nothing to plot.")
        return

    # Convert the dictionary to a DataFrame for display
    corrs_df = pd.DataFrame.from_dict(corrs, orient='index', columns=['Correlation'])
    corrs_df = corrs_df.reset_index().rename(columns={'index': 'Time Frame'})

    # Sequential red palette with one shade per distinct correlation value.
    # The hue levels are the distinct rounded correlations, which can be fewer
    # than the number of rows when two time frames round to the same value.
    n_shades = max(corrs_df['Correlation'].nunique(), 1)
    seq_pal = sns.color_palette("Reds", n_shades)
    sns.barplot(data=corrs_df, y='Time Frame', x='Correlation', palette=seq_pal, hue='Correlation')
    plt.ylabel('')
    plt.title(f"Comparing Correlations of Yield To {stat}")
    plt.show()

# Run the correlation report once for each weather statistic, in this order
for stat_name in ('min_temp', 'avg_temp', 'max_temp', 'max_wind'):
    yield_corr(stat_name)