Skip to content
Prediction of Bitcoin

Prediction of Bitcoin

#pip install cryptocmd
# Data manipulation
# ==============================================================================
import pandas as pd
import numpy as np
import datetime
from cryptocmd import CmcScraper

# Plots
# ==============================================================================
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import seaborn as sns
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
plt.style.use('ggplot')

# Bitcoin colors
# ==============================================================================
# Hex color codes keyed by color name.
palette_btc = dict(
    orange='#f7931a',
    white='#ffffff',
    gray='#4d4d4d',
    blue='#0d579b',
    green='#329239',
)

# Modelling and Forecasting
# ==============================================================================
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import backtesting_forecaster
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

Data download

STEPS

  • Here I download the Bitcoin ('BTC') price history between two dates with the "CmcScraper" class of the cryptocmd library.
  • The data is sorted by the "Date" column in ascending order (oldest first), and the sorted result replaces the original DataFrame.
  • The pandas option 'display.max_columns' is temporarily set to None so that every column is shown when the first rows are displayed; afterwards the option is restored.
# Data download
# ==============================================================================
# Scraper is initialized with the coin symbol and the start / end of the
# download window (day-month-year strings).
scraper = CmcScraper('BTC', '28-04-2013', '01-01-2022')

# Transform collected data into a dataframe ordered from oldest to newest.
data = scraper.get_dataframe()
data.sort_values(by='Date', ascending=True, inplace=True)

# Show every column for this preview only. The context manager restores the
# previous value of 'display.max_columns' on exit, even if display() raises,
# instead of resetting the option to the pandas default.
with pd.option_context('display.max_columns', None):
    display(data.head(10))

Preparing the Data for the Machine Learning model

Steps

  • In the next cell we select the ['Date', 'Open', 'Close', 'High', 'Low'] columns of the original DataFrame (keeping only the needed columns reduces memory use, which can matter for large DataFrames).
  • All column names are then automatically converted to lower case.
  • The date column is converted to pandas datetime values using the format year-month-day hour:minute:second.
  • set_index('date') makes the date column the index of the DataFrame (the left-most column of the table shown above).

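Careful: this Jupyter cell should be executed only once. After it runs, the 'Date' column no longer exists under that name, so running it a second time fails.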

# Data preparation
# ==============================================================================
# Keep only the date and OHLC price columns, lower-case their names, parse the
# dates and promote them to the index.
price_columns = ['Date', 'Open', 'Close', 'High', 'Low']
data = (
    data.loc[:, price_columns]
    .rename(columns=str.lower)
    .assign(date=lambda frame: pd.to_datetime(frame['date'],
                                              format='%Y-%m-%d %H:%M:%S'))
    .set_index('date')
)
  • Here we set the sampling frequency of the DataFrame to daily ('D').
  • asfreq inserts a row filled with null values (NaN) for every day that is missing from the index, so that the index has a regular daily frequency.
  • Because the DataFrame may now contain null values, we count them per column with isnull.
  • Finally we sort the DataFrame by its index.
# Conform the index to a fixed daily frequency; days that were absent from the
# index show up as rows of NaN.
data = data.asfreq('D')

# Report how many values are missing in each column.
missing_per_column = data.isnull().sum()
print('Columns with missing values\n\n', missing_per_column)
# Alternative view -- fraction of rows with at least one missing value:
#   data.isnull().any(axis=1).mean()

# Keep the rows in chronological order.
data = data.sort_index()

Bitcoin halving as an exogenous variable

... It is intended to use the days remaining until the next halving and its mining rewards as exogenous variables to predict the price of Bitcoin. The next halving is estimated to occur approximately in 2024, although its exact date is unknown. The remaining blocks as of 2022-01-14 from the Coinmarketcap website, 121,400, and the average number of Bitcoin network blocks mined per day, 144 (average block time ≈ 10 minutes) are used to determine it.

# Dict with Bitcoin halvings info
# ==============================================================================
# One entry per halving. The date of the last one is not known here, so it is
# left as NaN and replaced by the estimate computed below.
btc_halving = dict(
    halving=[0, 1, 2, 3, 4],
    date=['2009-01-03', '2012-11-28', '2016-07-09', '2020-05-11', np.nan],
    reward=[50, 25, 12.5, 6.25, 3.125],
    halving_block_number=[0, 210000, 420000, 630000, 840000],
)

# Next halving calculation
# The remaining blocks according to the coinmarketcap.com website for
# the next halving as of 2022-01-14 are taken as a starting point
# ==============================================================================
remaining_blocks = 121400
blocks_per_day   = 144

# Estimated (fractional) number of days needed to mine the remaining blocks.
days = remaining_blocks / blocks_per_day

# Day on which the remaining-blocks figure was read.
next_halving_initial = '2022-01-14'
reference_date = pd.to_datetime(next_halving_initial, format='%Y-%m-%d')

# Reference day + estimated days = estimated halving date. Formatting with
# '%Y-%m-%d' keeps only the calendar date, so the fractional-day part is
# dropped, e.g. Timestamp('2024-05-06 01:20:00') -> '2024-05-06'.
next_halving = (reference_date + datetime.timedelta(days=days)).strftime('%Y-%m-%d')

# Replace the NaN placeholder of the last halving with the estimated date.
btc_halving['date'][-1] = next_halving

print(f'The next halving will occur on approximately: {next_halving}')
# Include rewards and countdown to next halving in dataset
# ==============================================================================
# Both columns start as NaN; rows that fall outside every halving period
# listed in btc_halving keep NaN.
data['reward']            = np.nan
data['countdown_halving'] = np.nan

# First date present in the dataset (loop-invariant, so computed once).
min_date_data = data.index.min()

# Compare real timestamps rather than date strings.
halving_dates = pd.to_datetime(btc_halving['date'])

# Each period runs from one halving (inclusive) to the next one (exclusive);
# zip stops after the last period that has a following halving date.
periods = zip(halving_dates[:-1], halving_dates[1:], btc_halving['reward'])
for period_start, end_date, reward in periods:

    # A period that began before the data does is clipped to the first
    # available day.
    start_date = max(period_start, min_date_data)

    # Both conditions must hold: on/after the start and strictly before the
    # next halving.
    mask = (data.index >= start_date) & (data.index < end_date)
    if not mask.any():
        # The whole period lies outside the dataset.
        continue

    # Fill column 'reward' with the mining reward valid in this period.
    data.loc[mask, 'reward'] = reward

    # Fill column 'countdown_halving' with the remaining days, derived from
    # each row's own date so the values stay correct even if the index has
    # gaps. The day before the halving gets 0, as with the reversed range
    # used previously.
    remaining_days = (end_date - data.index[mask]).days - 1
    data.loc[mask, 'countdown_halving'] = np.asarray(remaining_days)

display(data)
# Distinct reward values in order of first appearance.
rew_list = list(data['reward'].unique())

# Show every row where the reward is about to change from the first distinct
# value to the second one, i.e. the last day before that reward change.
# Pairing each value with its successor never reads past the last row, and the
# length guard covers a dataset with a single distinct reward value.
if len(rew_list) > 1:
    rewards = data['reward'].tolist()
    for pos, (current, following) in enumerate(zip(rewards, rewards[1:])):
        if current == rew_list[0] and following == rew_list[1]:
            display(pd.DataFrame(data.iloc[pos]))
    
# Check that the data have been created correctly
# ==============================================================================
# For each past halving, show the day before and the halving day itself so the
# values of 'reward' and 'countdown_halving' on both sides can be compared.
for label, day_before, position in (('Second', '2016-07-08', 2),
                                    ('Third', '2020-05-10', 3)):
    print(f'{label} halving:', btc_halving['date'][position])
    display(data.loc[day_before:btc_halving['date'][position]])
    print('')

# The estimated next halving lies beyond the data, so show the last rows instead.
print('Next halving:', btc_halving['date'][4])
data.tail(2)