Skip to content
Prediction of Bitcoin
Prediction of Bitcoin
#pip install cryptocmd
# Data manipulation
# ==============================================================================
import pandas as pd
import numpy as np
import datetime
from cryptocmd import CmcScraper
# Plots
# ==============================================================================
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import seaborn as sns
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
plt.style.use('ggplot')
# Bitcoin colors
# ==============================================================================
# Named hex colour codes, looked up by colour name.
palette_btc = dict(
    orange='#f7931a',
    white='#ffffff',
    gray='#4d4d4d',
    blue='#0d579b',
    green='#329239',
)
# Modelling and Forecasting
# ==============================================================================
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import backtesting_forecaster
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
Data download
STEPS
- Here I extract the information between certain dates with the "CmcScraper" library
- The data is sorted by the "Date" column in ascending order (oldest date first), and the sorted result replaces the original dataframe.
- The pandas option 'display.max_columns' is set to None only while the first rows are displayed, so that every column is shown, and is restored afterwards.
# Data download
# ==============================================================================
# Initialise the scraper with the coin symbol and the start / end dates of the
# download (the dates are written day-month-year).
scraper = CmcScraper('BTC', '28-04-2013', '01-01-2022')

# Collect the scraped data into a DataFrame ordered from oldest to newest date.
data = scraper.get_dataframe().sort_values(by='Date', ascending=True)

# Show every column for this preview only. option_context puts the previous
# 'display.max_columns' value back on exit, even if display() raises, instead
# of unconditionally resetting the option to the library default.
with pd.option_context('display.max_columns', None):
    display(data.head(10))
Preparing the Data for the Machine Learning model
Steps
- In the next cell we select the ['Date', 'Open', 'Close', 'High', 'Low'] columns of the original DataFrame (keep in mind that this reduces the amount of memory needed, which can be useful for large dataframes).
- All the column names are then automatically converted to lower case.
- The date column is converted to datetime values, parsed with the format (year-month-day) (hour:minutes:seconds).
- set_index('date') makes this whole column the index of the DataFrame (the leftmost column of the table, see above).
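Careful: this Jupyter cell should be executed only once. After the first run the 'Date' column no longer exists (it has been renamed and moved to the index), so running the cell again fails.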
# Data preparation
# ==============================================================================
# Keep only the date and the open / close / high / low columns, and give every
# kept column a lower-case name.
selected_columns = ['Date', 'Open', 'Close', 'High', 'Low']
lowercase_names = {name: name.lower() for name in selected_columns}
data = data.loc[:, selected_columns].rename(columns=lowercase_names)

# Parse the dates and use them as the index of the DataFrame.
data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d %H:%M:%S')
data = data.set_index('date')
- Here we set the sampling frequency of the dataframe to daily ('D').
- asfreq creates rows of missing values (NaN) for any day that is absent from the data, so that the dataframe has exactly one row per day.
- Because the dataframe may now contain missing values, we count them per column with isnull.
- Then we sort the index.
# Re-sample to a daily frequency; days absent from the data become rows of
# missing values.
data = data.asfreq('D')

# Report how many values are missing in each column.
missing_per_column = data.isnull().sum()
print('Columns with missing values\n\n', missing_per_column)

# Keep the rows ordered by their index.
data = data.sort_index()
Bitcoin halving as an exogenous variable
... The days remaining until the next halving and the mining reward are used as exogenous variables to predict the price of Bitcoin. The next halving is estimated to occur in approximately 2024, although its exact date is unknown. To estimate it, we use the number of blocks remaining as of 2022-01-14 according to the Coinmarketcap website (121,400) and the average number of Bitcoin network blocks mined per day (144, i.e. an average block time of ≈ 10 minutes).
# Dict with Bitcoin halvings info
# ==============================================================================
# One entry per halving. The date of the last one is not known yet: it starts
# as NaN and is overwritten with the estimate computed below.
btc_halving = {
    'halving': [0, 1, 2, 3, 4],
    'date': ['2009-01-03', '2012-11-28', '2016-07-09', '2020-05-11', np.nan],
    'reward': [50, 25, 12.5, 6.25, 3.125],
    'halving_block_number': [0, 210000, 420000, 630000, 840000],
}

# Next halving calculation
# The remaining blocks according to the coinmarketcap.com website for
# the next halving as of 2022-01-14 are taken as a starting point
# ==============================================================================
remaining_blocks = 121400
blocks_per_day = 144
days = remaining_blocks / blocks_per_day
next_halving_initial = '2022-01-14'

# Add the estimated number of days left to the reference date. Only the
# calendar date of the result is kept, written as a 'YYYY-MM-DD' string
# (e.g. Timestamp('2024-05-06 01:20:00') -> '2024-05-06').
reference_date = pd.to_datetime(next_halving_initial, format='%Y-%m-%d')
estimated_halving = reference_date + datetime.timedelta(days=days)
next_halving = estimated_halving.date().isoformat()

# Replace the NaN placeholder of the last halving with the estimated date.
btc_halving['date'][-1] = next_halving

print(f'The next halving will occur on approximately: {next_halving}')
# Include rewards and countdown to next halving in dataset
# ==============================================================================
# Both columns start out as NaN and are filled in one halving period at a time.
data['reward'] = np.nan
data['countdown_halving'] = np.nan

# The first date in the data is the same for every period, so it is computed
# once. Dates are handled as 'YYYY-MM-DD' strings, which compare in
# chronological order.
min_date_data = data.index.min().strftime('%Y-%m-%d')

# Each period runs from one halving date up to (not including) the next one and
# carries the reward of the halving that opens it.
halving_periods = zip(
    btc_halving['date'],
    btc_halving['date'][1:],
    btc_halving['reward'],
)
for halving_date, end_date, reward in halving_periods:
    # A period that began before the data does is clipped to the first date
    # available in the data.
    start_date = max(halving_date, min_date_data)

    # Rows dated on or after the start of the period AND strictly before the
    # next halving.
    mask = (data.index >= start_date) & (data.index < end_date)

    # Fill column 'reward' with the mining reward of this period.
    data.loc[mask, 'reward'] = reward

    # Fill column 'countdown_halving' with the days left until the next
    # halving (0 on the day before it). The value is derived from each row's
    # own date, so it does not depend on the rows being consecutive days.
    days_left = (pd.to_datetime(end_date) - data.index[mask]).days - 1
    data.loc[mask, 'countdown_halving'] = days_left.to_numpy()

display(data)
# Show every row that carries the first reward value and is immediately
# followed by a row carrying the second one (the last day of the first reward
# period found in the data).
rew_list = list(data['reward'].unique())
if len(rew_list) > 1:
    rewards = data['reward']
    # shift(-1) pairs each row with the reward of the row after it. The final
    # row is paired with NaN, so nothing past the end of the data is read.
    is_transition = (rewards == rew_list[0]) & (rewards.shift(-1) == rew_list[1])
    for position, flagged in enumerate(is_transition):
        if flagged:
            display(pd.DataFrame(data.iloc[position]))
# Check that the data have been created correctly
# ==============================================================================
# For each past halving, show the rows from the day before it up to the halving
# date itself, so the change in 'reward' and 'countdown_halving' can be seen.
halving_checks = (
    ('Second halving:', 2, '2016-07-08'),
    ('Third halving:', 3, '2020-05-10'),
)
for label, halving_number, day_before in halving_checks:
    halving_date = btc_halving['date'][halving_number]
    print(label, halving_date)
    display(data.loc[day_before:halving_date])
    print('')

# The next halving lies beyond the end of the data: show the last two rows.
print('Next halving:', btc_halving['date'][4])
data.tail(2)