Skip to content

Yield and Fertilizer Treatment

# Load both CSV files and inspect their structure for initial exploration.
import io

import pandas as pd


def _info_text(frame):
    """Return the report that ``frame.info()`` would print, as a string.

    ``DataFrame.info()`` writes to a stream and returns ``None``; passing a
    ``StringIO`` buffer lets the report be kept in a variable.
    """
    buffer = io.StringIO()
    frame.info(buf=buffer)
    return buffer.getvalue()


# Load the datasets
rosas1999_path = 'data/rosas1999.csv'
rosas2001_path = 'data/rosas2001.csv'

rosas1999 = pd.read_csv(rosas1999_path)
rosas2001 = pd.read_csv(rosas2001_path)

# Basic info about both datasets. Assigning the result of .info() directly
# would store None, so the report text is captured and printed explicitly.
rosas1999_info = _info_text(rosas1999)
rosas2001_info = _info_text(rosas2001)
print(rosas1999_info)
print(rosas2001_info)

# First few rows of each dataset
rosas1999_head = rosas1999.head()
rosas2001_head = rosas2001.head()

rosas1999_head, rosas2001_head


# Summary statistics for each year's dataset
rosas1999_description, rosas2001_description = (
    frame.describe() for frame in (rosas1999, rosas2001)
)

# Number of missing entries in every column, per year
rosas1999_missing, rosas2001_missing = (
    frame.isnull().sum() for frame in (rosas1999, rosas2001)
)

rosas1999_description, rosas1999_missing, rosas2001_description, rosas2001_missing


# Correlation matrices for both datasets to check relationships between
# variables like YIELD, N, TOPO, and soil properties.
# Only numeric/boolean columns are correlated: since pandas 2.0,
# DataFrame.corr() raises on non-numeric columns instead of silently dropping
# them. NOTE(review): whether these CSVs contain such a column (e.g. a text
# TOPO label) is not visible here; the selection is a no-op if they do not.
rosas1999_corr = rosas1999.select_dtypes(include=['number', 'bool']).corr()
rosas2001_corr = rosas2001.select_dtypes(include=['number', 'bool']).corr()

# Correlations of YIELD with every other variable, strongest positive first
# (the YIELD entry itself is the trivial self-correlation of 1.0).
yield_corr_1999 = rosas1999_corr["YIELD"].sort_values(ascending=False)
yield_corr_2001 = rosas2001_corr["YIELD"].sort_values(ascending=False)

# Generate visualizations: histograms of YIELD and N for both years

import matplotlib.pyplot as plt

# DataFrame.hist() opens a figure of its own when no axes are passed, which
# left the prepared figure empty and sent the titles/subplots to the wrong
# figure. Each histogram is therefore drawn on an explicit axes of a single
# 2x2 grid: one row per year, one column per variable.
fig, axes = plt.subplots(2, 2, figsize=(14, 8))

for axes_row, (year, frame) in zip(axes, (('1999', rosas1999), ('2001', rosas2001))):
    for ax, column in zip(axes_row, ('YIELD', 'N')):
        # Missing values are dropped so they cannot disturb the binning.
        ax.hist(frame[column].dropna(), alpha=0.5, bins=30)
        ax.set_title(f'{year}: {column} Distribution')
        ax.set_xlabel(column)
        ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

# Yield against nitrogen rate, one panel per year, side by side
plt.figure(figsize=(14, 6))

nitrogen_panels = (
    ('1999: Yield vs Nitrogen', rosas1999),
    ('2001: Yield vs Nitrogen', rosas2001),
)
for slot, (panel_title, frame) in enumerate(nitrogen_panels, start=1):
    plt.subplot(1, 2, slot)
    plt.scatter(frame['N'], frame['YIELD'], alpha=0.5)
    plt.title(panel_title)
    plt.xlabel('Nitrogen (kg/ha)')
    plt.ylabel('Yield (quintals/ha)')

plt.tight_layout()
plt.show()

# Show the YIELD correlation rankings computed earlier
yield_corr_1999, yield_corr_2001
# Compare the yield distribution between topographic zones (TOP2, TOP3, TOP4)
# for 1999 and 2001.

import seaborn as sns
import matplotlib.pyplot as plt

# Collapse the TOP2/TOP3/TOP4 indicator columns into one zone code per row.
# NOTE(review): rows where all three indicators are 0 end up with code 0,
# presumably the baseline zone; confirm against the data dictionary.
for frame in (rosas1999, rosas2001):
    frame['TOPO_CAT'] = frame['TOP2'] * 2 + frame['TOP3'] * 3 + frame['TOP4'] * 4

# One boxplot panel per year, yield grouped by zone code
plt.figure(figsize=(14, 6))

topography_panels = (
    ('1999: Yield Distribution by Topography', rosas1999),
    ('2001: Yield Distribution by Topography', rosas2001),
)
for slot, (panel_title, frame) in enumerate(topography_panels, start=1):
    plt.subplot(1, 2, slot)
    sns.boxplot(data=frame, x='TOPO_CAT', y='YIELD')
    plt.title(panel_title)
    plt.xlabel('Topography (TOPO_CAT)')
    plt.ylabel('Yield (quintals/ha)')

plt.tight_layout()
plt.show()

# Relationship between brightness (BV) and yield, one panel per year
def _brightness_panel(slot, frame, panel_title):
    """Draw YIELD against BV for one dataset in subplot position ``slot``."""
    plt.subplot(1, 2, slot)
    sns.scatterplot(data=frame, x='BV', y='YIELD', alpha=0.5)
    plt.title(panel_title)
    plt.xlabel('Brightness (BV)')
    plt.ylabel('Yield (quintals/ha)')


plt.figure(figsize=(14, 6))
_brightness_panel(1, rosas1999, '1999: Yield vs Brightness')
_brightness_panel(2, rosas2001, '2001: Yield vs Brightness')

plt.tight_layout()
plt.show()

# Spatial check: yield plotted against latitude for each year
# (only LATITUDE is used here; longitude is not plotted).
plt.figure(figsize=(14, 6))

latitude_panels = (
    ('1999: Yield vs Latitude', rosas1999),
    ('2001: Yield vs Latitude', rosas2001),
)
for slot, (panel_title, frame) in enumerate(latitude_panels, start=1):
    panel_ax = plt.subplot(1, 2, slot)
    sns.scatterplot(data=frame, x='LATITUDE', y='YIELD', alpha=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Latitude')
    panel_ax.set_ylabel('Yield (quintals/ha)')

plt.tight_layout()
plt.show()

Hypothesis Setup:

Null Hypothesis (H₀): Nitrogen application has no effect on yield (i.e., the coefficient for nitrogen in the regression model is zero).

Alternative Hypothesis (H₁): Nitrogen application has an effect on yield (i.e., the coefficient for nitrogen is different from zero).
import statsmodels.api as sm


def _fit_yield_on_nitrogen(frame):
    """Regress YIELD on N by ordinary least squares for one dataset.

    Returns the design matrix (N plus a constant column for the intercept),
    the response series, and the fitted results object.
    """
    design = sm.add_constant(frame['N'])
    response = frame['YIELD']
    return design, response, sm.OLS(response, design).fit()


# Simple regression of yield on nitrogen, fitted separately for each year
X_1999, y_1999, model_1999 = _fit_yield_on_nitrogen(rosas1999)
X_2001, y_2001, model_2001 = _fit_yield_on_nitrogen(rosas2001)

# Regression summary tables
summary_1999, summary_2001 = model_1999.summary(), model_2001.summary()

summary_1999, summary_2001

Hypothesis Setup:

Null Hypothesis (H₀): Topography has no effect on yield (i.e., the coefficients for the topographic zones are all zero).

Alternative Hypothesis (H₁): Topography has an effect on yield (i.e., at least one topographic-zone coefficient is different from zero).
import statsmodels.api as sm
import pandas as pd

# Re-read both CSV files so this section starts from freshly loaded data.
# Rebinding these names replaces the earlier frames, so columns added to them
# before this point are not carried over.
rosas1999 = pd.read_csv('data/rosas1999.csv')
rosas2001 = pd.read_csv('data/rosas2001.csv')

# Topography indicator columns used as regressors
_TOPO_COLUMNS = ['TOP2', 'TOP3', 'TOP4']

# Design matrices (indicators plus a constant column for the intercept) and
# the yield response, per year
X_1999_topo = sm.add_constant(rosas1999[_TOPO_COLUMNS])
y_1999 = rosas1999['YIELD']

X_2001_topo = sm.add_constant(rosas2001[_TOPO_COLUMNS])
y_2001 = rosas2001['YIELD']

# Ordinary least squares fits of yield on topography, one per year
model_1999_topo = sm.OLS(y_1999, X_1999_topo).fit()
model_2001_topo = sm.OLS(y_2001, X_2001_topo).fit()

# Regression summary tables
summary_1999_topo, summary_2001_topo = (
    model_1999_topo.summary(),
    model_2001_topo.summary(),
)

summary_1999_topo, summary_2001_topo