Simple IOHMM with Dummy data
import pandas as pd
import numpy as np
# Set random seed for reproducibility
np.random.seed(42)
# Define the number of startups and the number of quarters
num_startups = 100
num_quarters = 20
# Generate dummy quarterly growth rates for each startup
startup_growth_rates = np.random.uniform(-0.1, 0.3, size=(num_startups, num_quarters))
# Compound the quarterly rates and take the geometric mean to get each startup's per-quarter CAGR
cagrs = np.prod(1 + startup_growth_rates, axis=1) ** (1 / num_quarters) - 1
# Generate dummy board diversity scores for each startup (ranging from 1 to 5)
board_diversity_scores = np.random.randint(1, 6, size=num_startups)
# Generate dummy ESG compliance scores for each startup (ranging from 1 to 5)
esg_compliance_scores = np.random.randint(1, 6, size=num_startups)
# Create a DataFrame to store the dummy startup data
startup_data = pd.DataFrame({
'Startup_ID': range(1, num_startups + 1),
'CAGR': cagrs,
'Board Diversity Score': board_diversity_scores,
'ESG Compliance Score': esg_compliance_scores
})
# Define the bins for CAGR categories
cagr_bins = [-np.inf, 0, 0.05, 0.1, np.inf]  # Bin edges for: <0%, 0-5%, 5-10%, >10%
# Define the labels for CAGR categories
cagr_labels = ['Decline', 'Low Growth', 'Moderate Growth', 'High Growth']
# Bin the CAGR values into categories
startup_data['CAGR Category'] = pd.cut(startup_data['CAGR'], bins=cagr_bins, labels=cagr_labels)
# Save the DataFrame to a CSV file
startup_data.to_csv('dummy_startup_data_with_cagr.csv', index=False)
print("Dummy startup data with CAGR categories saved to 'dummy_startup_data_with_cagr.csv'.")
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from hmmlearn import hmm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Define a function to fit HMM with different numbers of hidden states and compute AIC/BIC
def fit_hmm_with_criteria(X, max_states=10):
    aic_values = []
    bic_values = []
    models = []
    for n_states in range(1, max_states + 1):
        model = hmm.MultinomialHMM(n_components=n_states)
        model.fit(X)
        log_likelihood = model.score(X)
        # Rough parameter count: transition probabilities plus emission parameters
        n_params = n_states * (n_states - 1) + 2 * n_states * len(X[0]) - 1
        aic = -2 * log_likelihood + 2 * n_params
        bic = -2 * log_likelihood + np.log(len(X)) * n_params
        aic_values.append(aic)
        bic_values.append(bic)
        models.append(model)
    return aic_values, bic_values, models
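# Note (assumption about the hmmlearn version in use): hmmlearn's discrete-emission
# models expect observations encoded as non-negative integers, and recent releases
# redefined MultinomialHMM to model counts, with hmm.CategoricalHMM covering plain
# categorical symbols. A minimal sketch of encoding the discrete score columns to
# 0-based integer codes before fitting:
def encode_discrete_features(df):
    # Map each column's values to 0-based integer codes (e.g., scores 1-5 -> 0-4)
    return pd.DataFrame({col: pd.factorize(df[col], sort=True)[0] for col in df.columns})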
# Load the dataset (expected to contain discrete feature columns plus a 'CAGR'
# column, e.g., the dummy startup data generated above)
data = pd.read_csv("data.csv")
# Split the data into features and target variable
X = data.drop('CAGR', axis=1) # Features
y = data['CAGR'] # Target variable
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
max_states = 10 # Maximum number of hidden states to consider
aic_values, bic_values, models = fit_hmm_with_criteria(X_train.values, max_states) # Use .values to convert DataFrame to numpy array
# Find the number of hidden states that minimizes AIC/BIC
best_num_states_aic = np.argmin(aic_values) + 1 # Add 1 because of zero-based indexing
best_num_states_bic = np.argmin(bic_values) + 1
print("Best number of states based on AIC:", best_num_states_aic)
print("Best number of states based on BIC:", best_num_states_bic)
# Plot AIC and BIC values
plt.figure(figsize=(10, 6))
plt.plot(range(1, max_states + 1), aic_values, marker='o', label='AIC')
plt.plot(range(1, max_states + 1), bic_values, marker='o', label='BIC')
plt.xlabel('Number of Hidden States')
plt.ylabel('Information Criterion Value')
plt.title('AIC and BIC for Different Numbers of Hidden States')
plt.legend()
plt.grid(True)
plt.show()
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from hmmlearn import hmm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Load the dataset
data = pd.read_csv("data.csv")
# Split the data into features and target variable
X = data.drop(['CAGR'], axis=1) # Features
y = data['CAGR'] # Target variable
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize and train the IOHMM model (a standard hmmlearn MultinomialHMM is used
# here as a stand-in for a true input-output HMM)
n_states = 2  # Number of hidden states
iohmm_model = hmm.MultinomialHMM(n_components=n_states)
# Fit the model to the training data
iohmm_model.fit(X_train)
# Make predictions on the test data
y_pred = iohmm_model.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Visualize the most important features
state_0_probs = np.mean(X_train[iohmm_model.predict(X_train) == 0], axis=0)
state_1_probs = np.mean(X_train[iohmm_model.predict(X_train) == 1], axis=0)
plt.figure(figsize=(12, 8))
# Plotting state 0
plt.subplot(2, 1, 1)
plt.bar(np.arange(len(state_0_probs)), state_0_probs, color='blue', alpha=0.7, label='State 0')
plt.title('Average Probability of Metrics in Each Hidden State')
plt.ylabel('Probability')
plt.xticks(np.arange(len(state_0_probs)), X.columns, rotation=45)
plt.legend()
# Plotting state 1
plt.subplot(2, 1, 2)
plt.bar(np.arange(len(state_1_probs)), state_1_probs, color='orange', alpha=0.7, label='State 1')
plt.xlabel('Metrics')
plt.ylabel('Probability')
plt.xticks(np.arange(len(state_1_probs)), X.columns, rotation=45)
plt.legend()
plt.tight_layout()
plt.show()
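# The fitted hmmlearn model exposes the learned start probabilities and transition
# matrix; printing them gives a quick read on how persistent each hidden state is.
print("Start probabilities:\n", iohmm_model.startprob_)
print("Transition matrix:\n", iohmm_model.transmat_)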
# Scatter Plot Matrix (Pairplot)
df_pairplot = pd.concat([X_train, y_train], axis=1)
sns.pairplot(df_pairplot)
plt.suptitle('Pairplot of Features vs. Future CAGR', y=1.02)
plt.show()
# Boxplots
plt.figure(figsize=(12, 8))
for i, col in enumerate(X_train.columns):
    plt.subplot(3, 3, i + 1)
    sns.boxplot(x=X_train[col], y=y_train)
    plt.title(f'Boxplot of {col} vs. Future CAGR')
    plt.xlabel(col)
    plt.ylabel('Future CAGR')
plt.tight_layout()
plt.show()
# ESG Compliance Score Distribution (Histogram)
esg_compliance_scores = data['ESG Compliance Score']
plt.figure(figsize=(8, 6))
plt.hist(esg_compliance_scores, bins=20, color='skyblue', edgecolor='black')
plt.title('Distribution of ESG Compliance Scores')
plt.xlabel('ESG Compliance Score')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
# Demographic Diversity Radar Chart
def create_radar_chart(values, company_name):
    demographic_aspects = ['Gender Diversity', 'Ethnic Diversity', 'Age Diversity', 'etc.']  # Define demographic aspects
    num_vars = len(demographic_aspects)
    angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
    values = values + values[:1]  # Close the polygon without mutating the caller's list
    angles += angles[:1]
    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
    ax.fill(angles, values, color='skyblue', alpha=0.5)
    ax.plot(angles, values, color='blue', linewidth=1, linestyle='solid')
    # Label each spoke with its demographic aspect
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(demographic_aspects)
    plt.title(f'Demographic Diversity Profile of {company_name}', size=20, color='blue', y=1.1)
    plt.show()
# Example values for radar chart
values_company_1 = [0.8, 0.7, 0.6, 0.9] # Example values for company 1
values_company_2 = [0.6, 0.5, 0.4, 0.8] # Example values for company 2
create_radar_chart(values_company_1, 'Company 1')
create_radar_chart(values_company_2, 'Company 2')
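# df_cagr_comparison is not constructed anywhere in this notebook; a small dummy
# placeholder (assumed schema: one row per industry with a predicted CAGR) so that
# the bar chart below can run:
df_cagr_comparison = pd.DataFrame({
    'Industry': ['Fintech', 'Healthtech', 'Edtech', 'SaaS'],
    'Predicted CAGR': [0.12, 0.08, 0.05, 0.15],
})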
# Industry-wise CAGR Comparison (Bar Chart)
plt.figure(figsize=(10, 6))
sns.barplot(x='Industry', y='Predicted CAGR', data=df_cagr_comparison)
plt.title('Industry-wise CAGR Comparison')
plt.xlabel('Industry')
plt.ylabel('Predicted CAGR')
plt.xticks(rotation=45)
plt.show()
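# 'Business_Model_Category' is not part of the dummy data generated above; a dummy
# placeholder column (assumed categories) so that the pie chart below can run:
data['Business_Model_Category'] = np.random.choice(
    ['B2B', 'B2C', 'Marketplace', 'Hardware'], size=len(data)
)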
# Business Model Category Analysis (Pie Chart)
plt.figure(figsize=(8, 6))
data['Business_Model_Category'].value_counts().plot(kind='pie', autopct='%1.1f%%', startangle=140)
plt.title('Distribution of Business Model Categories')
plt.ylabel('')
plt.show()
# Install the hmmlearn dependency required by the modeling cells above
!pip install hmmlearn
from hmmlearn import hmm