# Skip to content  (web-page navigation residue from the notebook export; not code)
# The exclamation mark (!) at the beginning of the line is a special syntax in Jupyter notebooks.
# It allows you to run shell commands directly from a Jupyter notebook cell.
# This is useful for installing packages, managing files, or running other shell commands without leaving the notebook environment.

# In this case, the shell command is `pip install arch`.

# `pip` is the package installer for Python. You can use it to install packages from the Python Package Index (PyPI) and other indexes.
# It is a command-line tool that allows you to install, update, and manage Python packages.

# `install` is a command that tells `pip` to install a package.
# When you use `pip install`, you are instructing pip to download and install the specified package and its dependencies.

# `arch` is the name of the package you want to install. 
# The `arch` package is a library for autoregressive conditional heteroskedasticity (ARCH) models, 
# which are used in time series analysis to model and forecast changing variances.
# ARCH models are particularly useful in financial time series where volatility clustering is observed.

# When you run this cell, Jupyter will execute the shell command to install the `arch` package.
# This means that the `arch` package and its dependencies will be downloaded and installed in your Python environment.
# After installation, you can import and use the `arch` package in your Python code to perform time series analysis.

# Execute the shell command to install the `arch` package.
# NOTE: the leading `!` is IPython/Jupyter shell-escape syntax. It only works
# when this line runs inside a notebook/IPython session; a plain `python`
# interpreter rejects it as a syntax error.
!pip install arch
# Importing project libraries

# Data Manipulation
import pandas as pd  # pandas is used for data manipulation and analysis, providing data structures and operations for manipulating numerical tables and time series.
import numpy as np  # numpy is used for numerical operations on large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays.

# Data Visualization
import matplotlib.pyplot as plt  # matplotlib is a plotting library for creating static, animated, and interactive visualizations in Python.
import seaborn as sns  # seaborn is a data visualization library based on matplotlib that provides a high-level interface for drawing attractive and informative statistical graphics.

# Optimization & Statistical Tests
from scipy.optimize import minimize  # minimize is a function from scipy.optimize that is used for minimizing (or maximizing) objective functions.
from scipy.stats import kstest, norm, probplot, anderson  # scipy.stats is a module that contains a large number of probability distributions and statistical functions. 
# kstest: Kolmogorov-Smirnov test for goodness of fit.
# norm: A normal continuous random variable.
# probplot: Generate a probability plot, which is a graphical technique for assessing whether or not a data set follows a given distribution.
# anderson: Anderson-Darling test for data coming from a particular distribution.

# Econometric Models
from arch import arch_model  # arch_model is used for modeling and forecasting time series data with Autoregressive Conditional Heteroskedasticity (ARCH) effects.

# Ensemble Methods libraries
from sklearn.ensemble import RandomForestRegressor  # RandomForestRegressor is an ensemble learning method for regression that operates by constructing a multitude of decision trees at training time and outputting the mean prediction of the individual trees.
import xgboost as xgb  # xgboost is an optimized distributed gradient boosting library designed to be highly efficient, flexible, and portable.
from xgboost import XGBRegressor  # XGBRegressor is a scikit-learn API-compatible class for regression using the XGBoost library.
# Load the price table. The code below relies on a "date" column; the other
# columns are plotted as one price series each.
df = pd.read_csv("crypto_prices.csv")
df.head()  # notebook-style preview; has no visible effect when run as a script

# Parse "date" once, up front. Every frame derived from `df` below inherits
# the datetime dtype, so no per-split re-conversion is needed.
df["date"] = pd.to_datetime(df["date"])

# Chronological train/test split: rows strictly before the cutoff are used for
# training, rows on or after it for testing.
SPLIT_DATE = "2021-07-01"

# `.copy()` makes each split an independent frame. Without it, later column
# assignments on the splits raise pandas' SettingWithCopyWarning because the
# frames are derived from a selection of `df`.
df_train = df[df["date"] < SPLIT_DATE].copy()
df_test = df[df["date"] >= SPLIT_DATE].copy()

# Index the full frame by date for the overview plot. The splits were copied
# above, so they keep "date" as an ordinary column.
df = df.set_index("date")

# Print the column names of the dataframe to verify the structure
print(df.columns)

# One subplot per remaining column, sharing the date index.
df.plot(subplots=True, figsize=(10, 10))

# Display the plots
plt.show()
# Bitcoin prices: one figure for the training window, then one for the test
# window. Both figures share the same axis labels and differ only in the
# source frame and title.
for price_frame, plot_title in (
    (df_train, 'Bitcoin Prices - Training Data'),
    (df_test, 'Bitcoin Prices - Test Data'),
):
    price_frame.plot(x='date', y='bitcoin')
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.title(plot_title)
    plt.show()
# All coin prices over the test window on a single set of axes.
#
# The figure size is handed to DataFrame.plot directly. When no `ax` is
# supplied, DataFrame.plot opens its own figure, so a preceding
# `plt.figure(figsize=...)` call would only leave an empty extra figure behind
# and the requested size would never reach the actual chart.
df_test.plot(
    x='date',
    y=[
        'bitcoin',
        'bitcoin-cash',
        'ethereum',
        'ethereum-classic',
        'litecoin',
        'monero',
        'ripple',
        'stellar',
        'cardano',
    ],
    figsize=(12, 6),
)
plt.xlabel('Date')
plt.ylabel('Prices')
plt.title('Crypto Prices - Testing Data')
plt.show()
# Period-over-period simple returns for the training window.
# `df_train['date']` is already datetime (converted when the data was loaded),
# so it is not re-converted here; re-assigning the column on the split frame
# was a no-op that could trigger SettingWithCopyWarning.
df_train_returns = df_train.drop(columns=['date']).pct_change()
df_train_returns['date'] = df_train['date']  # re-attach dates (index-aligned)
df_train_returns = df_train_returns.iloc[1:]  # first row has no prior price -> NaN

# Numeric return columns only, computed once and reused below.
train_return_values = df_train_returns.drop(columns=['date'])

# Distribution of returns, one histogram per column.
train_return_values.hist(bins=100, figsize=(15, 10))
plt.tight_layout()
plt.show()

# Pairwise correlation of returns (used by the heatmap further down).
correlation_matrix = train_return_values.corr()

# Returns over time, all columns on one chart.
df_train_returns.plot(x='date', y=train_return_values.columns, figsize=(12, 6))
plt.xlabel('Date')
plt.ylabel('Returns')
plt.title('Evolution of Returns')
plt.legend(loc='upper left')
plt.show()
# Annotated heatmaps of the training-return correlation and covariance
# matrices. Both charts use the same styling, collected here once.
heatmap_style = dict(annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)

# Correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_style)
plt.title('Correlation Matrix - Training Data')
plt.show()

# Covariance matrix of the training-data returns
# NOTE(review): two-decimal annotations may be too coarse if the covariances
# are small in magnitude - confirm against the rendered chart.
covariance_matrix = df_train_returns.drop(columns=['date']).cov()
plt.figure(figsize=(10, 8))
sns.heatmap(covariance_matrix, **heatmap_style)
plt.title('Covariance Matrix - Training Data')
plt.show()
# Period-over-period simple returns for the test window, built the same way as
# the training returns.
# `df_test['date']` is already datetime (converted when the data was loaded),
# so it is attached as-is. Re-converting it after the `.iloc[1:]` slice was a
# no-op assignment on a sliced frame, which can raise SettingWithCopyWarning.
df_test_returns = df_test.drop(columns=['date']).pct_change()
df_test_returns['date'] = df_test['date']  # re-attach dates (index-aligned)
df_test_returns = df_test_returns.iloc[1:]  # first row has no prior price -> NaN

# Numeric return columns only, computed once and reused below.
test_return_values = df_test_returns.drop(columns=['date'])

# Distribution of returns, one histogram per column.
test_return_values.hist(bins=100, figsize=(15, 10))
plt.tight_layout()
plt.show()

# Returns over time, all columns on one chart.
df_test_returns.plot(x='date', y=test_return_values.columns, figsize=(12, 6))
plt.xlabel('Date')
plt.ylabel('Returns')
plt.title('Evolution of Returns')
plt.legend(loc='upper left')
plt.show()

# Summary of dtypes and non-null counts. One call is enough now that no
# conversion happens in between.
df_test_returns.info()

# Discard any rows that still contain missing returns.
df_test_returns = df_test_returns.dropna()