# DA-07-10-25-lesson12

!pip install kaggle

!pip install kagglehub

import kagglehub
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Download the dataset through kagglehub and report where it was stored
print("🔄 Завантаження датасету...")
path = kagglehub.dataset_download("johnsmith88/heart-disease-dataset")
print(f"✓ Шлях до датасету: {path}")

# Expected location of the CSV inside the download directory
file_name = "heart.csv"
csv_file = Path(path) / file_name

# If the expected file is missing, fall back to the first CSV found
# anywhere under the download directory; give up when there is none
if not csv_file.exists():
    print(f"⚠️  Файл '{file_name}' не знайдено, шукаємо інші CSV файли...")
    csv_files = list(Path(path).rglob("*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"CSV файли не знайдено в {path}")
    csv_file = csv_files[0]
    print(f"✓ Знайдено файл: {csv_file.name}")

print(f"📂 Повний шлях: {csv_file}")

# Load the CSV into a DataFrame.
# A read failure is reported and then re-raised: every later statement needs
# `heart_disease_df`, so continuing would only surface as a confusing
# NameError further down.
try:
    heart_disease_df = pd.read_csv(csv_file)
except Exception as e:
    print(f"❌ Помилка читання файлу: {e}")
    raise
else:
    print("✓ Дані успішно завантажено")
    print(f"📊 Розмір: {heart_disease_df.shape[0]:,} рядків × {heart_disease_df.shape[1]} стовпців")


# Print the first 5 rows of the DataFrame
print(heart_disease_df.head())

# Print information about the DataFrame.
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" line).
heart_disease_df.info()

# Visualize the cholesterol column as a histogram
heart_disease_df['chol'].plot(kind='hist')

# Set the title and axis labels
plt.title('Cholesterol distribution')
plt.xlabel('Cholesterol')
plt.ylabel('Frequency')
plt.show()

# Print the sex value counts of the heart disease dataset
print(heart_disease_df['sex'].value_counts())

# Drop the 'oldpeak' column
heart_disease_column_dropped = heart_disease_df.drop(['oldpeak'], axis=1)

# Drop duplicate rows
heart_disease_duplicates_dropped = heart_disease_column_dropped.drop_duplicates()

# Calculate the mean value of the restecg column
mean_value = heart_disease_duplicates_dropped['restecg'].mean()

# Impute missing values with the mean.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on a column selection may act on a temporary
# object and leave the frame unchanged (always so under copy-on-write).
heart_disease_duplicates_dropped['restecg'] = (
    heart_disease_duplicates_dropped['restecg'].fillna(mean_value)
)
# Expected to print False now that the column has been imputed
print(heart_disease_duplicates_dropped['restecg'].isna().any())



from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer# Split the data
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Split the cleaned frame 80/20 into train and test parts (fixed seed for reproducibility).
# NOTE(review): the whole frame is split, so any label column still present in it is
# carried into X_train/X_test and normalized with the features — confirm this is intended.
X_train, X_test = train_test_split(heart_disease_duplicates_dropped, test_size=0.2, random_state=42)
# Create the Normalizer, fit it on the training data and transform both splits
norm = Normalizer()
X_train_norm = norm.fit_transform(X_train)
X_test_norm = norm.transform(X_test)



# Histogram of 'age' in the training split before any scaling is applied
age = X_train['age']
plt.figure(figsize=(10, 5))
plt.hist(age, bins=30, alpha=0.5, label='Original')
plt.legend(prop={'size': 16})
plt.title('Histogram with Original Age')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()



# Min-max normalize 'age'.
# The scaled values are written into copies of the split: assigning into the
# frames returned by train_test_split triggers pandas' chained-assignment
# warning, and overwriting X_train['age'] would make the standardization
# below operate on already-normalized values instead of the original ages.
X_train_minmax = X_train.copy()
X_test_minmax = X_test.copy()

normalizer = MinMaxScaler()
X_train_minmax['age'] = normalizer.fit_transform(X_train['age'].values.reshape(-1, 1)).ravel()

# Use the same normalizer to transform the 'age' column of the test set to avoid data leakage
X_test_minmax['age'] = normalizer.transform(X_test['age'].values.reshape(-1, 1)).ravel()

plt.figure(figsize=(10, 5))
plt.hist(X_test_minmax['age'], bins=30, alpha=0.5, label='Normalized')
plt.legend(prop={'size': 16})
plt.title('Histogram with Normalized Age')
plt.xlabel('Normalized Age')
plt.ylabel('Count')
plt.show()


# Standardize the original 'age' values: fit on the training set only and
# reuse the fitted standardizer on the test set to avoid data leakage
X_train_std = X_train.copy()
X_test_std = X_test.copy()

standardizer = StandardScaler()
X_train_std['age'] = standardizer.fit_transform(X_train['age'].values.reshape(-1, 1)).ravel()
X_test_std['age'] = standardizer.transform(X_test['age'].values.reshape(-1, 1)).ravel()

plt.figure(figsize=(10, 5))
plt.hist(X_train_std['age'], bins=30, alpha=0.5, label='Standardized')
plt.legend(prop={'size': 16})
plt.title('Histogram with Standardized Age')
plt.xlabel('Standardized Age')
plt.ylabel('Count')
plt.show()



from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

# Build the feature matrix and the target vector; they were referenced here
# without being defined anywhere earlier in the script.
heart_disease_df_X = heart_disease_df.drop('target', axis=1)
heart_disease_df_y = heart_disease_df['target']

# Splitting data into train and test subsets first to avoid data leakage
X_train, X_test, y_train, y_test = train_test_split(
    heart_disease_df_X, heart_disease_df_y, test_size=0.2, random_state=42
)

# Define the random forest model and fit to the training data.
# random_state is fixed so the selected features are reproducible between runs.
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5, random_state=42)
rf.fit(X_train, y_train)

# Define the feature selection object on top of the already fitted forest
model = SelectFromModel(rf, prefit=True)

# Transform the training features
X_train_transformed = model.transform(X_train)

# Take the names from the feature matrix itself instead of assuming that the
# target is the last column of the raw frame
original_features = heart_disease_df_X.columns
print(f"Original features: {original_features}")

# Boolean mask of the features deemed important by the SelectFromModel
features_bool = model.get_support()

selected_features = original_features[features_bool]
print(f"\nSelected features: {selected_features}")

# Horizontal bar chart of the importances of the selected features
feature_importance = pd.DataFrame({
    "feature": selected_features,
    "importance": rf.feature_importances_[features_bool]
})
plt.figure(figsize=(10, 6))
plt.barh(feature_importance["feature"], feature_importance["importance"])
plt.show()




# Import required modules
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Build the feature matrix and the target vector; they were referenced here
# without being defined anywhere earlier in the script.
heart_disease_X = heart_disease_df.drop('target', axis=1)
heart_disease_y = heart_disease_df['target']

# Split the data into training and testing sets (80:20)
X_train, X_test, y_train, y_test = train_test_split(
    heart_disease_X, heart_disease_y, test_size=0.2, random_state=42
)

# Define the SVM / SVC model with a linear kernel and fit it on the train split
svc_model = SVC(kernel='linear')
svc_model.fit(X_train, y_train)

# Get predictions from the model for the held-out split
y_pred = svc_model.predict(X_test)
print(y_pred)





# Initialize the MLflow experiment
# NOTE(review): `mlflow` is not imported anywhere in the visible source — confirm
# the import exists before this section runs.
mlflow.set_experiment("Logistic Regression Heart Disease Prediction")

# Start a run and log every model coefficient and the intercept as run parameters
# NOTE(review): the most recent visible binding of `model` is the SelectFromModel
# selector, which does not appear to expose `coef_`/`intercept_`; presumably a
# fitted logistic regression is expected here — verify.
with mlflow.start_run():
    for idx, coef in enumerate(model.coef_[0]):
        mlflow.log_param(f"coef_{idx}", coef)
    mlflow.log_param("intercept", model.intercept_[0])

    # Print the id of the active run
    run_id = mlflow.active_run().info.run_id
    print(run_id)






# Names used below were never imported before this point in the script
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold, cross_val_score

# Feature matrix and target vector (referenced here without a prior definition)
heart_disease_df_X = heart_disease_df.drop('target', axis=1)
heart_disease_df_y = heart_disease_df['target']

# Create a KFold object
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Get the train and test row indices of the first split from the shuffled KFold
train_data_split, test_data_split = next(kfold.split(heart_disease_df_X))

# Print out the number of datapoints in the original training set, as well as the train and test splits
print("Number of training datapoints in heart_disease_df_X:", len(heart_disease_df_X))
print("Number of training datapoints in split:", len(train_data_split))
print("Number of testing datapoints in split:", len(test_data_split))



# Evaluate a classifier using k-fold cross-validation.
# `model` at this point is the SelectFromModel feature selector, which cannot
# predict, so a logistic regression is used, configured the same way as in the
# later sections of this file.
kf = KFold(n_splits=5)
lr_model = LogisticRegression(random_state=42, max_iter=1000)

# Compute the cross-validation score
score = cross_val_score(lr_model, heart_disease_df_X, heart_disease_df_y, scoring='balanced_accuracy', cv=kf)
print(score)

# cross_val_score fits clones only, so fit the estimator itself before predicting
lr_model.fit(heart_disease_df_X, heart_disease_df_y)

# Get model predictions (on the data the model was fitted on)
y_pred = lr_model.predict(heart_disease_df_X)

# Print confusion matrix
cm = confusion_matrix(heart_disease_df_y, y_pred)
print(cm)

import unittest
import numpy as np

class TestModelInference(unittest.TestCase):
    """Sanity checks on the predictions of the module-level `model`."""

    def setUp(self):
        # Bind the module-level estimator and test features to the test case
        self.model = model
        self.X_test = X_test

    def test_prediction_output_values(self):
        """Every distinct predicted value must be one of the labels 0 or 1."""
        print("Running test_prediction_output_values test case")

        predictions = self.model.predict(self.X_test)
        for label in np.unique(predictions):
            self.assertIn(label, [0, 1])

# Define the entity and the selected feature fields
# NOTE(review): Entity, Field, Float32, Int32, FileSource, FeatureView and
# FeatureStore are not imported anywhere in the visible source — presumably
# they come from the `feast` package; confirm the import.
patient = Entity(name="patient", join_keys=["patient_id"])
cp = Field(name="cp", dtype=Float32)
thalach = Field(name="thalach", dtype=Int32)
ca = Field(name="ca", dtype=Int32)
thal = Field(name="thal", dtype=Int32)

# Persist the DataFrame as a parquet file for the file source below
heart_disease_df.to_parquet("heart_disease.parquet")

# Point File Source to the saved file
# NOTE(review): the source names "timestamp" and "created" columns and the
# entity joins on "patient_id"; none of these columns is created in the
# visible code — verify they exist in heart_disease_df.
data_source = FileSource(
    path="heart_disease.parquet",
    event_timestamp_column="timestamp",
    created_timestamp_column="created",
)

# Create a Feature View of the features
heart_disease_fv = FeatureView(
    name="heart_disease",
    entities=[patient],
    schema=[cp, thalach, ca, thal],
    source=data_source,
)

# Create a store rooted at the current directory and register the entity and feature view
store = FeatureStore(repo_path=".")
store.apply([patient, heart_disease_fv])

import logging
import matplotlib.pyplot as plt# Setting up basic logging configuration
# Set up basic logging: INFO and above goes to predictions.log
logging.basicConfig(filename='predictions.log', level=logging.INFO)

# Make predictions on the test set one instance at a time and log the results.
# X_test / y_test are converted to arrays first so that rows are addressed by
# position: `X_test[i, :]` is not valid on a DataFrame and `y_test[i]` on a
# Series looks up the index *label* i, which a shuffled split need not contain.
# NOTE(review): `model` must provide predict(); the SelectFromModel selector
# bound to that name earlier in the file does not appear to — verify.
X_test_values = np.asarray(X_test)
y_test_values = np.asarray(y_test)
for i, (row, real_class) in enumerate(zip(X_test_values, y_test_values)):
    instance = row.reshape(1, -1)
    prediction = model.predict(instance)
    logging.info('Inst. %s - PredClass: %s, RealClass: %s', i, prediction[0], real_class)

# Side-by-side bar charts of the 'target' class counts, one panel per month
fig, ax = plt.subplots(1, 2, figsize=(15, 6))  # 1 row, 2 columns
monthly_logs = (('January', logs_january), ('February', logs_february))
for panel, (month, logs) in zip(ax, monthly_logs):
    logs['target'].value_counts().plot(kind='bar', ax=panel)
    panel.set_title(f'Distribution of Predicted Classes - {month}')
    panel.set_xlabel('Class')
    panel.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

import kagglehub
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import warnings

# Silence all warnings for the rest of the run
# NOTE(review): this also hides library deprecation warnings — confirm that is intended.
warnings.filterwarnings('ignore')

# Plot settings so that Ukrainian (Cyrillic) text renders on the charts
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['axes.unicode_minus'] = False

# ==================== DATA DOWNLOAD ====================
print("🔄 Завантаження датасету...")
path = kagglehub.dataset_download("johnsmith88/heart-disease-dataset")
print(f"✓ Шлях до датасету: {path}")

# Expected location of the CSV inside the download directory
file_name = "heart.csv"
csv_file = Path(path) / file_name

# If the expected file is missing, fall back to the first CSV found
# anywhere under the download directory; give up when there is none
if not csv_file.exists():
    print(f"⚠️  Файл '{file_name}' не знайдено, шукаємо інші CSV файли...")
    csv_files = list(Path(path).rglob("*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"CSV файли не знайдено в {path}")
    csv_file = csv_files[0]
    print(f"✓ Знайдено файл: {csv_file.name}")

print(f"📂 Повний шлях: {csv_file}")

# Load the CSV into a DataFrame.
# On failure the error is reported and the script stops with a non-zero exit
# status. `exit()` is an interactive helper installed by the `site` module
# (not guaranteed to exist) and, called without arguments, reports success.
try:
    heart_disease_df = pd.read_csv(csv_file)
except Exception as e:
    print(f"❌ Помилка читання файлу: {e}")
    raise SystemExit(1) from e
else:
    print("✓ Дані успішно завантажено")
    print(f"📊 Розмір: {heart_disease_df.shape[0]:,} рядків × {heart_disease_df.shape[1]} стовпців")

# Ukrainian display names for the dataset columns, keyed by column name
feature_names_uk = dict(
    age='Вік',
    sex='Стать',
    cp='Тип болю в грудях',
    trestbps='Артеріальний тиск',
    chol='Холестерин',
    fbs='Цукор натще',
    restecg='ЕКГ спокою',
    thalach='Макс. частота серцебиття',
    exang='Стенокардія при навантаженні',
    oldpeak='Депресія ST',
    slope='Нахил сегмента ST',
    ca='Кількість судин',
    thal='Таласемія',
    target='Хвороба серця',
)

# ==================== BASIC DATA OVERVIEW ====================
print("\n" + "="*70)
print("БАЗОВИЙ ОГЛЯД ДАНИХ")
print("="*70)

# First 5 rows
print("\n📊 Перші 5 рядків:")
print(heart_disease_df.head())

# DataFrame summary.
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" line).
print("\n📋 Інформація про DataFrame:")
heart_disease_df.info()

# ==================== CHOLESTEROL VISUALIZATION ====================
print("\n" + "="*70)
print("ВІЗУАЛІЗАЦІЯ РОЗПОДІЛУ ХОЛЕСТЕРИНУ")
print("="*70)

# 30-bin histogram of the 'chol' column, saved to PNG and then shown
plt.figure(figsize=(10, 6))
heart_disease_df['chol'].plot(kind='hist', bins=30, edgecolor='black', 
                              alpha=0.7, color='steelblue')
plt.title('Розподіл рівня холестерину в крові', fontsize=16, fontweight='bold', pad=20)
plt.xlabel('Рівень холестерину (мг/дл)', fontsize=13)
plt.ylabel('Частота (кількість пацієнтів)', fontsize=13)
plt.grid(alpha=0.3, linestyle='--')
plt.tight_layout()
# Save before show(): the figure is written to disk first, then displayed
plt.savefig('01_cholesterol_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# ==================== SEX ANALYSIS ====================
print("\n" + "="*70)
print("РОЗПОДІЛ ЗА СТАТТЮ")
print("="*70)
print(heart_disease_df['sex'].value_counts())

# Bar chart of the number of patients per sex.
# value_counts() orders its result by frequency, not by category, so the
# counts are re-ordered to [0, 1] to line up with the fixed bar labels below;
# otherwise the labels are swapped whenever class 1 is the more frequent one.
plt.figure(figsize=(10, 6))
sex_counts = heart_disease_df['sex'].value_counts().reindex([0, 1], fill_value=0)
colors = ['#3498db', '#e74c3c']
bars = plt.bar(['Жінки (0)', 'Чоловіки (1)'], sex_counts.values,
               color=colors, edgecolor='black', linewidth=1.5)
plt.title('Розподіл пацієнтів за статтю', fontsize=16, fontweight='bold', pad=20)
plt.xlabel('Стать', fontsize=13)
plt.ylabel('Кількість пацієнтів', fontsize=13)
plt.grid(axis='y', alpha=0.3, linestyle='--')

# Write each count above its bar
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
            f'{int(height)}', ha='center', va='bottom', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig('02_sex_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# ==================== DATA CLEANING ====================
print("\n" + "="*70)
print("ОЧИЩЕННЯ ДАНИХ")
print("="*70)

# Drop the 'oldpeak' column when it is present; otherwise work on a copy
if 'oldpeak' in heart_disease_df.columns:
    heart_disease_column_dropped = heart_disease_df.drop(['oldpeak'], axis=1)
    print("✓ Видалено стовпець 'oldpeak'")
else:
    heart_disease_column_dropped = heart_disease_df.copy()
    print("⚠️  Стовпець 'oldpeak' не знайдено")

# Drop duplicate rows and report how many were removed
heart_disease_duplicates_dropped = heart_disease_column_dropped.drop_duplicates()
print(f"✓ Видалено дублікатів: {len(heart_disease_column_dropped) - len(heart_disease_duplicates_dropped)}")

# Fill missing values in 'restecg' with the column mean.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on a column selection may act on a temporary
# object and leave the frame unchanged (always so under copy-on-write).
if 'restecg' in heart_disease_duplicates_dropped.columns:
    mean_value = heart_disease_duplicates_dropped['restecg'].mean()
    heart_disease_duplicates_dropped['restecg'] = (
        heart_disease_duplicates_dropped['restecg'].fillna(mean_value)
    )
    print(f"✓ Заповнено пропущені значення в 'restecg' середнім: {mean_value:.2f}")
    print(f"✓ Пропущених значень в 'restecg': {heart_disease_duplicates_dropped['restecg'].isna().any()}")

# ==================== TRAIN / TEST SPLIT ====================
print("\n" + "="*70)
print("РОЗДІЛЕННЯ НА ТРЕНУВАЛЬНУ ТА ТЕСТОВУ ВИБІРКИ")
print("="*70)

# Work on a copy to avoid pandas chained-assignment warnings
heart_disease_clean = heart_disease_duplicates_dropped.copy()

# 80/20 split of the whole cleaned frame with a fixed seed
# NOTE(review): any label column still present in the frame is split and
# normalized together with the features — confirm this is intended.
X_train, X_test = train_test_split(heart_disease_clean, test_size=0.2, random_state=42)
print(f"✓ Тренувальна вибірка: {X_train.shape}")
print(f"✓ Тестова вибірка: {X_test.shape}")

# ==================== NORMALIZATION ====================
print("\n" + "="*70)
print("НОРМАЛІЗАЦІЯ ДАНИХ")
print("="*70)

# Normalizer: fit on the training split, then transform both splits
norm = Normalizer()
X_train_norm = norm.fit_transform(X_train)
X_test_norm = norm.transform(X_test)
print("✓ Нормалізація за допомогою Normalizer завершена")

# ==================== AGE DISTRIBUTION PLOTS ====================
print("\n" + "="*70)
print("ВІЗУАЛІЗАЦІЯ РОЗПОДІЛУ ВІКУ")
print("="*70)


def _save_age_histogram(values, label, color, title, xlabel, filename):
    """Draw one 12x6, 30-bin histogram of *values*, save it to *filename* and show it."""
    plt.figure(figsize=(12, 6))
    plt.hist(values, bins=30, alpha=0.7, label=label,
             edgecolor='black', color=color)
    plt.legend(prop={'size': 14}, loc='upper right')
    plt.title(title, fontsize=16, fontweight='bold', pad=20)
    plt.xlabel(xlabel, fontsize=13)
    plt.ylabel('Кількість пацієнтів', fontsize=13)
    plt.grid(alpha=0.3, linestyle='--')
    plt.tight_layout()
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.show()


if 'age' in X_train.columns:
    # Original (unscaled) ages of the training split
    _save_age_histogram(
        X_train['age'], 'Оригінальні дані', '#3498db',
        'Розподіл віку пацієнтів (оригінальні дані)',
        'Вік (роки)', '03_age_original.png',
    )

    # ---------- Min-max normalization of age ----------
    print("\n📊 Min-Max нормалізація віку...")

    # Scale copies so the split itself keeps the original ages
    X_train_minmax = X_train.copy()
    X_test_minmax = X_test.copy()

    # Fit on the training ages only, then reuse the scaler on the test ages
    normalizer = MinMaxScaler()
    X_train_minmax['age'] = normalizer.fit_transform(X_train_minmax['age'].values.reshape(-1, 1))
    X_test_minmax['age'] = normalizer.transform(X_test_minmax['age'].values.reshape(-1, 1))

    _save_age_histogram(
        X_test_minmax['age'], 'Нормалізовані дані (Min-Max)', '#e67e22',
        'Розподіл віку після Min-Max нормалізації',
        'Нормалізований вік (0-1)', '04_age_minmax.png',
    )

    # ---------- Standardization of age ----------
    print("\n📊 Стандартизація віку...")

    # Again scale copies, starting from the original ages
    X_train_std = X_train.copy()
    X_test_std = X_test.copy()

    standardizer = StandardScaler()
    X_train_std['age'] = standardizer.fit_transform(X_train_std['age'].values.reshape(-1, 1))
    X_test_std['age'] = standardizer.transform(X_test_std['age'].values.reshape(-1, 1))

    _save_age_histogram(
        X_train_std['age'], 'Стандартизовані дані (Z-score)', '#27ae60',
        'Розподіл віку після стандартизації (Z-score)',
        'Стандартизований вік (Z-score)', '05_age_standardized.png',
    )

    # ---------- The three distributions side by side ----------
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    panels = (
        (X_train['age'], '#3498db', 'Оригінальні дані', 'Вік (роки)'),
        (X_test_minmax['age'], '#e67e22', 'Min-Max нормалізація', 'Нормалізований вік (0-1)'),
        (X_train_std['age'], '#27ae60', 'Стандартизація (Z-score)', 'Стандартизований вік'),
    )
    for axis, (values, color, title, xlabel) in zip(axes, panels):
        axis.hist(values, bins=30, alpha=0.7, color=color, edgecolor='black')
        axis.set_title(title, fontsize=14, fontweight='bold')
        axis.set_xlabel(xlabel, fontsize=12)
        axis.set_ylabel('Кількість', fontsize=12)
        axis.grid(alpha=0.3)

    plt.suptitle('Порівняння методів нормалізації віку',
                 fontsize=16, fontweight='bold', y=1.02)
    plt.tight_layout()
    plt.savefig('06_age_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

# ==================== PREPARATION FOR MODELING ====================
print("\n" + "="*70)
print("ПІДГОТОВКА ДЛЯ МОДЕЛЮВАННЯ")
print("="*70)

# Separate features (X) from the target (y)
if 'target' not in heart_disease_df.columns:
    print("❌ Цільова змінна 'target' не знайдена")
    # No 'target' column: treat the last column as the target instead
    heart_disease_df_X = heart_disease_df.iloc[:, :-1]
    heart_disease_df_y = heart_disease_df.iloc[:, -1]
    print(f"⚠️  Використовується остання колонка як цільова: '{heart_disease_df.columns[-1]}'")
else:
    heart_disease_df_X = heart_disease_df.drop('target', axis=1)
    heart_disease_df_y = heart_disease_df['target']
    print("✓ Цільова змінна: 'target'")

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    heart_disease_df_X,
    heart_disease_df_y,
    test_size=0.2,
    random_state=42,
)
print(f"✓ X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"✓ X_test: {X_test.shape}, y_test: {y_test.shape}")

# ==================== FEATURE SELECTION (RANDOM FOREST) ====================
print("\n" + "="*70)
print("ВІДБІР ОЗНАК ЗА ДОПОМОГОЮ RANDOM FOREST")
print("="*70)

# Fit a seeded random forest on the training split
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5, random_state=42)
rf.fit(X_train, y_train)
print("✓ Random Forest модель навчена")

# Wrap the fitted forest in a selector and reduce the training features
model = SelectFromModel(rf, prefit=True)
X_train_transformed = model.transform(X_train)

original_features = heart_disease_df_X.columns
print(f"\n📋 Оригінальні ознаки ({len(original_features)}): {list(original_features)}")

# Boolean mask of the kept features and their names
features_bool = model.get_support()
selected_features = original_features[features_bool]
print(f"\n✅ Вибрані ознаки ({len(selected_features)}): {list(selected_features)}")

# Importances of the selected features, smallest first for the bar chart
feature_importance = pd.DataFrame({
    "feature": selected_features,
    "importance": rf.feature_importances_[features_bool]
}).sort_values('importance', ascending=True)

# Ukrainian display names (fall back to the raw column name)
feature_importance['feature_uk'] = [
    feature_names_uk.get(name, name) for name in feature_importance['feature']
]

plt.figure(figsize=(12, 8))
bars = plt.barh(feature_importance["feature_uk"], feature_importance["importance"],
                color='steelblue', edgecolor='black', linewidth=1.2)
plt.xlabel('Важливість ознаки', fontsize=13, fontweight='bold')
plt.ylabel('Ознака', fontsize=13, fontweight='bold')
plt.title('Важливість вибраних медичних ознак для прогнозування хвороби серця',
          fontsize=15, fontweight='bold', pad=20)
plt.grid(alpha=0.3, axis='x', linestyle='--')

# Write each importance value at the end of its bar
for bar in bars:
    width = bar.get_width()
    plt.text(width, bar.get_y() + bar.get_height()/2.,
            f'{width:.3f}', ha='left', va='center', fontsize=10,
            fontweight='bold', color='darkblue')

plt.tight_layout()
plt.savefig('07_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

# ==================== SVM MODEL ====================
print("\n" + "="*70)
print("НАВЧАННЯ SVM МОДЕЛІ")
print("="*70)

# Prepare independent copies of the features and target for the SVM
heart_disease_X = heart_disease_df_X.copy()
heart_disease_y = heart_disease_df_y.copy()

# 80/20 split with a fixed seed, kept in SVM-specific names
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(
    heart_disease_X, heart_disease_y, test_size=0.2, random_state=42
)

# Fit a linear-kernel SVM on the training split
svc_model = SVC(kernel='linear', random_state=42)
svc_model.fit(X_train_svm, y_train_svm)
print("✓ SVM модель навчена")

# Predictions for the test split (only the first 10 are printed)
y_pred_svm = svc_model.predict(X_test_svm)
print(f"\n📊 Перші 10 прогнозів: {y_pred_svm[:10]}")

# Score of the SVM on the test split (the classifier's default score())
accuracy_svm = svc_model.score(X_test_svm, y_test_svm)
print(f"✓ Точність SVM моделі: {accuracy_svm:.4f}")

# ==================== LOGISTIC REGRESSION ====================
print("\n" + "="*70)
print("НАВЧАННЯ LOGISTIC REGRESSION")
print("="*70)

# Fit logistic regression on the training split
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train, y_train)
print("✓ Logistic Regression модель навчена")

# One coefficient per feature (Ukrainian display name), then the intercept
print("\n📊 Коефіцієнти моделі:")
for column_name, weight in zip(X_train.columns, lr_model.coef_[0]):
    print(f"   {feature_names_uk.get(column_name, column_name)}: {weight:.4f}")
print(f"\n📊 Intercept (зміщення): {lr_model.intercept_[0]:.4f}")

accuracy_lr = lr_model.score(X_test, y_test)
print(f"✓ Точність Logistic Regression: {accuracy_lr:.4f}")

# Coefficients sorted ascending for the horizontal bar chart
coef_df = pd.DataFrame({
    'feature': X_train.columns,
    'coefficient': lr_model.coef_[0]
}).sort_values('coefficient', ascending=True)

coef_df['feature_uk'] = [feature_names_uk.get(name, name) for name in coef_df['feature']]

plt.figure(figsize=(12, 8))
# Red bars for negative coefficients, green for the rest
colors = ['#e74c3c' if value < 0 else '#27ae60' for value in coef_df['coefficient']]
bars = plt.barh(coef_df['feature_uk'], coef_df['coefficient'],
                color=colors, edgecolor='black', linewidth=1.2)
plt.xlabel('Значення коефіцієнта', fontsize=13, fontweight='bold')
plt.ylabel('Ознака', fontsize=13, fontweight='bold')
plt.title('Коефіцієнти логістичної регресії для прогнозування хвороби серця',
          fontsize=15, fontweight='bold', pad=20)
plt.axvline(x=0, color='black', linestyle='--', linewidth=1.5)
plt.grid(alpha=0.3, axis='x', linestyle='--')

# Write each value at the outer end of its bar
for bar in bars:
    width = bar.get_width()
    alignment = 'left' if width > 0 else 'right'
    plt.text(width, bar.get_y() + bar.get_height()/2.,
            f'{width:.3f}', ha=alignment,
            va='center', fontsize=9, fontweight='bold')

plt.tight_layout()
plt.savefig('08_logistic_coefficients.png', dpi=300, bbox_inches='tight')
plt.show()

# ==================== K-FOLD CROSS-VALIDATION ====================
print("\n" + "="*70)
print("K-FOLD КРОС-ВАЛІДАЦІЯ")
print("="*70)

# Shuffled, seeded 5-fold splitter
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Row indices of the first train/test split
train_data_split, test_data_split = next(kfold.split(heart_disease_df_X))

print(f"📊 Кількість точок у оригінальному наборі: {len(heart_disease_df_X)}")
print(f"📊 Кількість точок у тренувальному розділі: {len(train_data_split)}")
print(f"📊 Кількість точок у тестовому розділі: {len(test_data_split)}")

# Balanced-accuracy cross-validation of the logistic regression
print("\n📊 Крос-валідація для Logistic Regression...")
kf = KFold(n_splits=5, shuffle=True, random_state=42)
score = cross_val_score(
    lr_model,
    heart_disease_df_X,
    heart_disease_df_y,
    scoring='balanced_accuracy',
    cv=kf,
)
print(f"✓ Оцінки крос-валідації: {score}")
print(f"✓ Середня оцінка: {score.mean():.4f} (+/- {score.std():.4f})")

# Bar chart of the per-fold scores with the mean as a dashed line
plt.figure(figsize=(12, 6))
folds = [f'Fold {number}' for number in range(1, len(score) + 1)]
bars = plt.bar(folds, score, color='#3498db', edgecolor='black', linewidth=1.5, alpha=0.7)
plt.axhline(y=score.mean(), color='red', linestyle='--', linewidth=2,
            label=f'Середнє: {score.mean():.4f}')
plt.xlabel('Номер fold', fontsize=13, fontweight='bold')
plt.ylabel('Збалансована точність', fontsize=13, fontweight='bold')
plt.title('Результати 5-fold крос-валідації (Logistic Regression)',
          fontsize=15, fontweight='bold', pad=20)
plt.ylim([0, 1.0])
plt.legend(fontsize=12)
plt.grid(alpha=0.3, axis='y', linestyle='--')

# Write each score above its bar
for bar in bars:
    top = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., top,
            f'{top:.3f}', ha='center', va='bottom',
            fontsize=11, fontweight='bold')

plt.tight_layout()
plt.savefig('09_cross_validation.png', dpi=300, bbox_inches='tight')
plt.show()

# ==================== CONFUSION MATRIX ====================
print("\n" + "="*70)
print("МАТРИЦЯ ПЛУТАНИНИ")
print("="*70)

# Fit on the full data set and predict on the same data for the matrix
lr_full = LogisticRegression(random_state=42, max_iter=1000)
lr_full.fit(heart_disease_df_X, heart_disease_df_y)
y_pred_full = lr_full.predict(heart_disease_df_X)

cm = confusion_matrix(heart_disease_df_y, y_pred_full)
print("\n📊 Матриця плутанини:")
print(cm)

# Single annotated heatmap of the raw counts
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Blues', cbar=True,
    square=True, linewidths=2, linecolor='black',
    annot_kws={'size': 16, 'weight': 'bold'},
    xticklabels=['Немає хвороби\n(0)', 'Є хвороба\n(1)'],
    yticklabels=['Немає хвороби\n(0)', 'Є хвороба\n(1)'],
    cbar_kws={'label': 'Кількість передбачень'},
)
plt.title('Матриця плутанини для прогнозування серцевих захворювань',
          fontsize=16, fontweight='bold', pad=20)
plt.ylabel('Справжні значення', fontsize=14, fontweight='bold')
plt.xlabel('Прогнозовані значення', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('10_confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# Row-normalized matrix: each row sums to 100 %
cm_percent = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: absolute counts; right panel: row percentages
heatmap_panels = (
    (ax1, cm, 'd', 'Blues', 'Кількість', 'Абсолютні значення'),
    (ax2, cm_percent, '.1f', 'Greens', 'Відсоток (%)', 'Відсоткові значення'),
)
for axis, matrix, number_format, palette, bar_label, panel_title in heatmap_panels:
    sns.heatmap(
        matrix, annot=True, fmt=number_format, cmap=palette, ax=axis,
        square=True, linewidths=2, linecolor='black',
        annot_kws={'size': 14, 'weight': 'bold'},
        xticklabels=['Немає хвороби', 'Є хвороба'],
        yticklabels=['Немає хвороби', 'Є хвороба'],
        cbar_kws={'label': bar_label},
    )
    axis.set_title(panel_title, fontsize=14, fontweight='bold', pad=15)
    axis.set_ylabel('Справжні значення', fontsize=12, fontweight='bold')
    axis.set_xlabel('Прогнозовані значення', fontsize=12, fontweight='bold')

plt.suptitle('Порівняльний аналіз матриці плутанини',
             fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('11_confusion_matrix_detailed.png', dpi=300, bbox_inches='tight')
plt.show()

# ==================== MODEL COMPARISON ====================
print("\n" + "="*70)
print("ПОРІВНЯННЯ МОДЕЛЕЙ")
print("="*70)

# Bar chart comparing the SVM score, the logistic-regression score and the mean CV score
# NOTE(review): the first two values come from score() on the test split while the
# third is the mean of 'balanced_accuracy' CV scores, yet all three share one
# "Accuracy" axis — confirm that mixing the metrics is intended.
models_names = ['SVM\n(лінійне ядро)', 'Logistic\nRegression', 'Крос-валідація\n(середнє)']
accuracies = [accuracy_svm, accuracy_lr, score.mean()]
colors_models = ['#3498db', '#e74c3c', '#27ae60']

plt.figure(figsize=(12, 7))
bars = plt.bar(models_names, accuracies, color=colors_models, 
               edgecolor='black', linewidth=2, alpha=0.8)
plt.ylabel('Точність (Accuracy)', fontsize=13, fontweight='bold')
plt.xlabel('Модель машинного навчання', fontsize=13, fontweight='bold')
plt.title('Порівняння точності різних моделей прогнозування', 
          fontsize=16, fontweight='bold', pad=20)
# The y-axis is limited to [0.7, 1.0]; a bar below 0.7 would fall outside the visible range
plt.ylim([0.7, 1.0])
plt.grid(alpha=0.3, axis='y', linestyle='--')

# Write each value (fraction and percent) above its bar
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.4f}\n({height*100:.2f}%)', 
            ha='center', va='bottom', 
            fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig('12_models_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# ==================== FINAL REPORT ====================
print("\n" + "="*70)
print("ФІНАЛЬНИЙ ЗВІТ")
print("="*70)
print("\n✅ Аналіз завершено успішно!")
# Dataset size followed by the three headline scores (fraction and percent)
print(f"📊 Датасет: {heart_disease_df.shape[0]} записів, {heart_disease_df.shape[1]} ознак")
print(f"🎯 Точність SVM: {accuracy_svm:.4f} ({accuracy_svm*100:.2f}%)")
print(f"🎯 Точність Logistic Regression: {accuracy_lr:.4f} ({accuracy_lr*100:.2f}%)")
print(f"🎯 Середня крос-валідаційна оцінка: {score.mean():.4f} ({score.mean()*100:.2f}%)")
print("\n💾 Збережено 12 візуалізацій у форматі PNG")
print("\n" + "="*70)

import kagglehub
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Font setup for Ukrainian text on the charts
plt.rcParams['font.family'] = 'DejaVu Sans'

# Load the data
from pathlib import Path  # local import so this section also runs on its own

path = kagglehub.dataset_download("johnsmith88/heart-disease-dataset")

# Same lookup strategy as the loading step at the top of the file: prefer
# heart.csv, otherwise fall back to the first CSV found under the download
# directory, and fail with a clear error when there is none.
heart_csv = Path(path) / "heart.csv"
if not heart_csv.exists():
    csv_candidates = sorted(Path(path).rglob("*.csv"))
    if not csv_candidates:
        raise FileNotFoundError(f"CSV файли не знайдено в {path}")
    heart_csv = csv_candidates[0]

df = pd.read_csv(heart_csv)

# Split into features (X) and the label column (y)
X = df.drop('target', axis=1)
y = df['target']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
# NOTE(review): the earlier analysis in this file drops duplicate rows before
# modelling, while this split works on the raw frame. If the CSV contains
# duplicates, identical rows may end up in both train and test - confirm
# whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ========== TASK 1: COMPARING THE SCALING METHODS ==========

# Feature whose distribution is shown before and after scaling
feature = 'age'

# One scaler per method, each fitted on the training column only
minmax = MinMaxScaler()
standard = StandardScaler()
robust = RobustScaler()

age_column = X_train[[feature]]
age_minmax, age_standard, age_robust = [
    fitted.fit_transform(age_column) for fitted in (minmax, standard, robust)
]

# 2x2 grid: raw data in the top-left panel, one scaled version in each of the
# remaining panels.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (axes, data, bar colour, panel title, x-axis label) for every panel
panels = [
    (axes[0, 0], X_train[feature], 'steelblue',
     'Оригінальні дані', 'Вік (роки)'),
    (axes[0, 1], age_minmax, 'orange',
     'Min-Max нормалізація', 'Нормалізований вік [0-1]'),
    (axes[1, 0], age_standard, 'green',
     'Стандартизація (Z-score)', 'Стандартизований вік'),
    (axes[1, 1], age_robust, 'purple',
     'Robust Scaler', 'Robust scaled вік'),
]

for panel_ax, panel_data, panel_color, panel_title, panel_xlabel in panels:
    panel_ax.hist(panel_data, bins=30, edgecolor='black', alpha=0.7, color=panel_color)
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.set_xlabel(panel_xlabel, fontsize=12)
    panel_ax.set_ylabel('Частота', fontsize=12)
    panel_ax.grid(alpha=0.3)

plt.suptitle('Порівняння методів нормалізації для віку', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig('normalization_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# ========== TASK 2: EFFECT ON THE MODEL ==========

# Scaling strategies to compare; None means the features are used as-is.
scalers = {
    'Без нормалізації': None,
    'Min-Max': MinMaxScaler(),
    'Стандартизація': StandardScaler(),
    'Robust Scaler': RobustScaler(),
}

# One {'Метод', 'Точність'} record per strategy, in iteration order
results = []

for name, scaler in scalers.items():
    if scaler is not None:
        # Fit the scaler on the training split only, then reuse it for test
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
    else:
        X_train_scaled, X_test_scaled = X_train, X_test

    # Same model configuration for every strategy, so only the scaling differs
    model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_scaled, y_train)

    # Accuracy on the held-out test split
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)

    results.append({'Метод': name, 'Точність': accuracy})
    print(f"{name}: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Plot the results: one bar per scaling method
results_df = pd.DataFrame(results)

# Keep the 0.7 axis floor while every bar clears it; otherwise drop the floor
# just below the weakest score so that no bar is clipped out of view.
lowest_accuracy = results_df['Точність'].min()
y_floor = 0.7 if lowest_accuracy > 0.7 else max(0.0, lowest_accuracy - 0.05)

plt.figure(figsize=(12, 6))
bars = plt.bar(results_df['Метод'], results_df['Точність'],
               color=['steelblue', 'orange', 'green', 'purple'],
               edgecolor='black', linewidth=1.5, alpha=0.8)

plt.title('Порівняння точності Logistic Regression з різними методами нормалізації',
          fontsize=14, fontweight='bold', pad=20)
plt.xlabel('Метод нормалізації', fontsize=12, fontweight='bold')
plt.ylabel('Точність (Accuracy)', fontsize=12, fontweight='bold')
plt.ylim([y_floor, 1.0])
plt.grid(alpha=0.3, axis='y', linestyle='--')
plt.xticks(rotation=15, ha='right')

# Annotate each bar with its value (fraction and percentage), anchored to the
# top edge of the bar.
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.4f}\n({height*100:.1f}%)',
            ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.savefig('model_accuracy_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# ========== CONCLUSIONS ==========
conclusions_rule = "=" * 60
print("\n" + conclusions_rule)
print("ВИСНОВКИ")
print(conclusions_rule)

# Row with the highest accuracy (idxmax returns the first one on ties)
best_result = results_df.loc[results_df['Точність'].idxmax()]
best_accuracy = best_result['Точність']
print(f"\nНайкращий результат: {best_result['Метод']}")
print(f"Точність: {best_accuracy:.4f} ({best_accuracy*100:.2f}%)")

# Accuracy of the run without any scaling, used as the baseline
is_baseline = results_df['Метод'] == 'Без нормалізації'
baseline_accuracy = results_df.loc[is_baseline, 'Точність'].values[0]

# Difference between the two accuracies, multiplied by 100 (percentage points)
improvement = (best_accuracy - baseline_accuracy) * 100
print(f"Покращення відносно базової моделі: {improvement:.2f}%")