# (Web-page navigation residue, not part of the program: "Skip to content")
import pandas as pd

# Build a small illustrative farmer dataset: one list of five values per column.
data = dict(
    Farmer_ID=[1, 2, 3, 4, 5],
    Age=[27, 24, 29, 22, 30],
    Education=['Secondary', 'None', 'Tertiary', 'Primary', 'Secondary'],
    Experience_Years=[3, 1, 5, 2, 7],
    Crop_Type=['Maize', 'Tomato', 'Soybean', 'Onion', 'Wheat'],
    Region=['West Zone', 'East Zone', 'North Zone', 'South Zone', 'West Zone'],
    Land_Size_Acres=[3.5, 1.2, 5.0, 2.0, 6.0],
    Annual_Revenue_USD=[6000, 2000, 10000, 2500, 12000],
    Loan_Amount_USD=[3000, 1000, 5000, 1500, 6000],
    Loan_Approved=['Yes', 'No', 'Yes', 'Yes', 'Yes'],
    Repaid_On_Time=['Yes', 'No', 'Yes', 'No', 'Yes'],
    Past_Defaults=[0, 1, 0, 1, 0],
    Weather_Risk_Index=[0.2, 0.6, 0.1, 0.5, 0.3],
    Market_Stability_Score=[0.8, 0.4, 0.9, 0.6, 0.7],
)

# Turn the column mapping into a DataFrame and persist it as CSV.
# index=False keeps the positional row index out of the file.
df = pd.DataFrame.from_dict(data)
df.to_csv("farmer_data.csv", index=False)
import pandas as pd

# Load the data.
# "None" is a real Education level in this dataset, but the string "None" is
# one of pandas' default NA markers, so a plain read_csv() would turn it into
# NaN and the category would vanish from the summaries below. Restrict NA
# parsing to empty fields only.
df = pd.read_csv("farmer_data.csv", keep_default_na=False, na_values=[""])

# 1. Show structure and types
print("\n--- Dataset Info ---")
# info() prints its report itself and returns None; wrapping it in print()
# would emit a stray "None" line after the report.
df.info()

# 2. Statistical summary of numeric columns
print("\n--- Summary Statistics (Numerical) ---")
print(df.describe())

# 3. Summary of categorical columns (those stored with object dtype)
categorical_cols = df.select_dtypes(include='object').columns

print("\n--- Summary of Categorical Variables ---")
for col in categorical_cols:
    print(f"\n{col} Value Counts:")
    print(df[col].value_counts())
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data.
# keep_default_na=False stops pandas from parsing the legitimate Education
# level "None" as a missing value (it is one of the default NA markers);
# only empty fields are treated as NaN.
df = pd.read_csv('farmer_data.csv', keep_default_na=False, na_values=[''])

# Set up style. No figure is created here: every plot below opens its own
# figure, so a figure created at this point would only show up as an empty
# window on the first plt.show().
sns.set(style='whitegrid')

# 1. Revenue vs. Loan Amount (colour = approval, marker = repayment)
plt.figure(figsize=(8, 5))
sns.scatterplot(x='Annual_Revenue_USD', y='Loan_Amount_USD', hue='Loan_Approved', style='Repaid_On_Time', data=df)
plt.title('Loan Amount vs Annual Revenue')
plt.xlabel('Annual Revenue (USD)')
plt.ylabel('Loan Amount (USD)')
plt.legend(title='Loan Status')
plt.tight_layout()
plt.show()

# 2. Loan Repayment by Education Level
plt.figure(figsize=(7, 4))
sns.countplot(x='Education', hue='Repaid_On_Time', data=df)
plt.title('Loan Repayment by Education Level')
plt.xlabel('Education Level')
plt.ylabel('Number of Farmers')
plt.legend(title='Loan Repaid')
plt.tight_layout()
plt.show()

# 3. Market Stability vs Weather Risk (with repayment status)
plt.figure(figsize=(8, 5))
sns.scatterplot(x='Market_Stability_Score', y='Weather_Risk_Index', hue='Repaid_On_Time', data=df, s=100)
plt.title('Weather Risk vs Market Stability')
plt.xlabel('Market Stability')
plt.ylabel('Weather Risk Index')
plt.tight_layout()
plt.show()

# 4. Loan Repaid by Crop Type (horizontal bars: crop on the y axis)
plt.figure(figsize=(8, 4))
sns.countplot(y='Crop_Type', hue='Repaid_On_Time', data=df)
plt.title('Loan Repayment by Crop Type')
plt.xlabel('Number of Farmers')
plt.ylabel('Crop Type')
plt.tight_layout()
plt.show()

# 5. Repayment vs Experience
plt.figure(figsize=(8, 5))
sns.boxplot(x='Repaid_On_Time', y='Experience_Years', data=df)
plt.title('Experience Distribution by Loan Repayment')
plt.xlabel('Loan Repaid')
plt.ylabel('Years of Experience')
plt.tight_layout()
plt.show()
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load data.
# The Education level "None" is a real category, but "None" is also one of
# pandas' default NA markers. Without keep_default_na=False it is read as NaN,
# get_dummies() then encodes it as all zeros, and it becomes indistinguishable
# from the baseline category removed by drop_first=True.
df = pd.read_csv('farmer_data.csv', keep_default_na=False, na_values=[''])

# Step 2: Convert target column to binary (Yes → 1, No → 0)
df['Repaid_On_Time'] = df['Repaid_On_Time'].map({'Yes': 1, 'No': 0})

# Step 3: One-hot encode categorical variables.
# drop_first=True drops the first level of each variable to avoid a redundant column.
categorical_cols = ['Education', 'Crop_Type', 'Region', 'Loan_Approved']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Step 4: Define features (X) and target (y).
# Farmer_ID is an identifier, not a predictor, so it is excluded from X.
X = df_encoded.drop(['Farmer_ID', 'Repaid_On_Time'], axis=1)
y = df_encoded['Repaid_On_Time']

# Step 5: Split into train/test (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 6: Train model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 7: Evaluate on the held-out rows
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Step 8: Feature importance plot
importances = model.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=True)

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df, palette='viridis')
plt.title('Feature Importance for Predicting Loan Repayment')
plt.tight_layout()
plt.show()
# Describe a new applicant using the same raw (un-encoded) field names as the training data
applicant_record = {
    'Age': 26,
    'Education': 'Secondary',
    'Experience_Years': 3,
    'Crop_Type': 'Maize',
    'Region': 'East Zone',
    'Land_Size_Acres': 2.5,
    'Annual_Revenue_USD': 5500,
    'Loan_Amount_USD': 2500,
    'Loan_Approved': 'Yes',
    'Past_Defaults': 0,
    'Weather_Risk_Index': 0.2,
    'Market_Stability_Score': 0.75,
}
new_farmer = pd.DataFrame([applicant_record])

# One-hot encode the applicant, then align to the training feature set:
# reindex() keeps exactly X's columns in X's order, and any training column
# the single-row encoding did not produce is filled with 0.
new_encoded = pd.get_dummies(new_farmer).reindex(columns=X.columns, fill_value=0)

# Predict repayment likelihood (1 = repaid on time, 0 = not)
predicted = model.predict(new_encoded)
if predicted[0] == 1:
    verdict = 'Yes'
else:
    verdict = 'No'
print(f"Loan Repayment Prediction: {verdict}")
# Prerequisite (run in a shell, not inside Python): pip install streamlit
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Example farmer data: one list of five values per column
data = dict(
    Farmer_ID=[1, 2, 3, 4, 5],
    Age=[27, 24, 29, 22, 30],
    Education=['Secondary', 'None', 'Tertiary', 'Primary', 'Secondary'],
    Experience_Years=[3, 1, 5, 2, 7],
    Crop_Type=['Maize', 'Tomato', 'Soybean', 'Onion', 'Wheat'],
    Region=['West Zone', 'East Zone', 'North Zone', 'South Zone', 'West Zone'],
    Land_Size_Acres=[3.5, 1.2, 5.0, 2.0, 6.0],
    Annual_Revenue_USD=[6000, 2000, 10000, 2500, 12000],
    Loan_Amount_USD=[3000, 1000, 5000, 1500, 6000],
    Loan_Approved=['Yes', 'No', 'Yes', 'Yes', 'Yes'],
    Repaid_On_Time=['Yes', 'No', 'Yes', 'No', 'Yes'],
    Past_Defaults=[0, 1, 0, 1, 0],
    Weather_Risk_Index=[0.2, 0.6, 0.1, 0.5, 0.3],
    Market_Stability_Score=[0.8, 0.4, 0.9, 0.6, 0.7],
)

# Create DataFrame
df = pd.DataFrame(data)

# Integer-encode the free-text categorical columns. The single encoder is
# re-fitted for each column in turn, so afterwards it holds only the mapping
# of the last column processed.
label_encoder = LabelEncoder()
for column in ('Education', 'Crop_Type', 'Region'):
    df[column] = label_encoder.fit_transform(df[column])

# Yes/No columns become 1/0
yes_no = {'Yes': 1, 'No': 0}
for column in ('Loan_Approved', 'Repaid_On_Time'):
    df[column] = df[column].map(yes_no)

# Features and target (Farmer_ID is an identifier, not a predictor)
y = df['Repaid_On_Time']
X = df.drop(['Farmer_ID', 'Repaid_On_Time'], axis=1)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train a Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save the trained model to a .pkl file
with open('loan_repayment_model.pkl', 'wb') as f:
    pickle.dump(model, f)

print("Model trained and saved successfully.")
# Prerequisite (run in a shell, not inside Python): pip install streamlit
import streamlit as st
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Dummy data for the farmer dataset
data = {
    'Farmer_ID': [1, 2, 3, 4, 5],
    'Age': [27, 24, 29, 22, 30],
    'Education': ['Secondary', 'None', 'Tertiary', 'Primary', 'Secondary'],
    'Experience_Years': [3, 1, 5, 2, 7],
    'Crop_Type': ['Maize', 'Tomato', 'Soybean', 'Onion', 'Wheat'],
    'Region': ['West Zone', 'East Zone', 'North Zone', 'South Zone', 'West Zone'],
    'Land_Size_Acres': [3.5, 1.2, 5.0, 2.0, 6.0],
    'Annual_Revenue_USD': [6000, 2000, 10000, 2500, 12000],
    'Loan_Amount_USD': [3000, 1000, 5000, 1500, 6000],
    'Loan_Approved': ['Yes', 'No', 'Yes', 'Yes', 'Yes'],
    'Repaid_On_Time': ['Yes', 'No', 'Yes', 'No', 'Yes'],
    'Past_Defaults': [0, 1, 0, 1, 0],
    'Weather_Risk_Index': [0.2, 0.6, 0.1, 0.5, 0.3],
    'Market_Stability_Score': [0.8, 0.4, 0.9, 0.6, 0.7]
}

# Create DataFrame
df = pd.DataFrame(data)

# Encode categorical columns.
# One fitted LabelEncoder is kept per column so that the exact same
# label -> integer mapping can be applied to the user's input further down.
label_columns = ['Education', 'Crop_Type', 'Region']
label_encoders = {}
for col in label_columns:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

# Yes/No columns become 1/0; the same mapping is reused for the input form.
yes_no_map = {'Yes': 1, 'No': 0}
df['Loan_Approved'] = df['Loan_Approved'].map(yes_no_map)
df['Repaid_On_Time'] = df['Repaid_On_Time'].map(yes_no_map)

# Features and target (Farmer_ID is an identifier, not a predictor)
X = df.drop(['Farmer_ID', 'Repaid_On_Time'], axis=1)
y = df['Repaid_On_Time']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save the trained model to a .pkl file
with open('loan_repayment_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Streamlit UI
st.title('Farmer Loan Repayment Prediction')
st.write("Please enter the details of the new farmer to predict loan repayment likelihood.")

# Input fields for the farmer data.
# Every select-box option below also occurs in the training data, so the
# fitted encoders can transform any choice the user makes.
age = st.number_input('Age', min_value=18, max_value=100, value=26)
education = st.selectbox('Education Level', ['None', 'Primary', 'Secondary', 'Tertiary'])
experience_years = st.number_input('Years of Experience', min_value=0, max_value=50, value=3)
crop_type = st.selectbox('Crop Type', ['Maize', 'Tomato', 'Soybean', 'Onion', 'Wheat'])
region = st.selectbox('Region', ['West Zone', 'East Zone', 'North Zone', 'South Zone'])
land_size = st.number_input('Land Size (Acres)', min_value=0.1, max_value=100.0, value=2.5)
annual_revenue = st.number_input('Annual Revenue (USD)', min_value=0, max_value=100000, value=5500)
loan_amount = st.number_input('Loan Amount (USD)', min_value=0, max_value=50000, value=2500)
loan_approved = st.selectbox('Loan Approved', ['Yes', 'No'])
past_defaults = st.selectbox('Past Defaults', [0, 1])
weather_risk = st.slider('Weather Risk Index', 0.0, 1.0, 0.2)
market_stability = st.slider('Market Stability Score', 0.0, 1.0, 0.75)

# Create DataFrame for the new farmer's input
new_farmer = pd.DataFrame([{
    'Age': age,
    'Education': education,
    'Experience_Years': experience_years,
    'Crop_Type': crop_type,
    'Region': region,
    'Land_Size_Acres': land_size,
    'Annual_Revenue_USD': annual_revenue,
    'Loan_Amount_USD': loan_amount,
    'Loan_Approved': loan_approved,
    'Past_Defaults': past_defaults,
    'Weather_Risk_Index': weather_risk,
    'Market_Stability_Score': market_stability
}])

# Process input data with the SAME encoding the model was trained on.
# The model was fit on integer label-encoded columns, so one-hot encoding the
# input would yield column names that do not exist in X and the user's
# categorical choices would never reach the model.
new_encoded = new_farmer.copy()
for col, encoder in label_encoders.items():
    new_encoded[col] = encoder.transform(new_encoded[col])
new_encoded['Loan_Approved'] = new_encoded['Loan_Approved'].map(yes_no_map)

# Ensure column order matches training set
new_encoded = new_encoded[X.columns]

# Predict repayment likelihood (1 = repaid on time, 0 = not)
predicted = model.predict(new_encoded)

# Display prediction result
if predicted[0] == 1:
    st.success("Loan Repayment Prediction: **Yes**, the farmer is likely to repay the loan on time.")
else:
    st.error("Loan Repayment Prediction: **No**, the farmer is not likely to repay the loan on time.")