Star type, Temperature, Luminosity, a galactic 3D-map and more

# Start coding here...

Use the hipparcos mission data to plot the Hertzsprung Russell

diagram and from it estimate the mass, radius, luminosity, effective temperature. Also make a 3D-map of the bodies covered in this catalogue, map the density distribution, map the velocity profile, map the energy density, estimate the distribution of star-type per quadrant.

I am a novice and this is my first publicly available

data science project. As the project is big, I decided to split it into at least two parts, this being the first one.

Here I shall download the .csv.gz file from the ESA repository

untar it in a .csv file, get the relevant parameters, calculate the interest variables, estimate new variables from this calculated variables, plot the 3D-map given two distance limits, make some plots to understand the data

# libraries
import gzip
import urllib.request
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator
import math
from scipy.optimize import curve_fit
import scipy.stats as stats
#----------------------------------------------------------------
### useful functions 
def poly(x, vec):
    '''Return polinomial function
    ______________________
    Takes: the number which the function is calculated(x: a float) and
    the vector of coefficients (vec: an array)
    ______________________
    Returns: the value of the polynimial at the point x (a float) '''
    temp = 0
    for coef in range(len(vec)):
        temp = temp + vec[coef]*x**coef
    return temp     
def point_distance(p_1, p_2):
    '''Calculate the distance between two points in a 2d environment
    ______________________
    Takes: two tuples containig (x, y) locations of each
    ______________________
    Returns: the distance between these points(a float) '''
    x_1, y_1 = p_1
    x_2, y_2 = p_2
    return np.sqrt((x_1 - x_2)**2 + (y_1 + y_2)**2)
def weigths(x , y, limits): 
    '''Calculate the weigths (weigthed probability) of the star types as the inverse 
    of the distanceto the star type lines in the pre-defined limits
    ______________________
    Takes: the coordinates of the point and the star type lines limits 
    ______________________
    Returns: a normalized vector of the distance of the point to each star type
    lines (a float), which may be interpreted as probabilities '''
    distances = []
    for limit in limits:
        x_il, x_sl, coef = limit
        if x >= x_il and x <= x_sl : 
            distances.append(1/max(abs(y - poly(x, vec=coef)), 0.01))
        elif x < x_il:
            distances.append(1/max(point_distance((x, y), (x_il, poly(x_il, vec=coef))), 0.01))
        elif x > x_sl:
            distances.append(1/max(point_distance((x, y), (x_sl, poly(x_sl, vec=coef))), 0.01))
    return distances/np.sum(distances)
#----------------------------------------------------------------------

Libraries and some useful functions loaded, let us write some functions to download and process the data

####### functions to import the data from the ESA website and preprocessing ########
def web_download(url_file, filename):
    """Download the file named filename
       ____________
       Args (str): the url addres of the file       
       ____________
       Returns (str): name pf the dowloaded data 
    """
    try:
        urllib.request.urlretrieve(url_file, filename)
        return(filename)
    except:
        print('NonExistentFile')
        return False 
def gz_data_extraction(url, output):
    """Extract a .gz data from the web apri uncompress it
       ____________
       Args (str): its url adress 
       ____________
       Returns (str): None, bu it saves the file in another file
    """
    # Download archive
    try:
        # Read the file inside the .gz archive located at url
        with urllib.request.urlopen(url) as response:
            with gzip.GzipFile(fileobj=response) as uncompressed:
                file_content = uncompressed.read()
        # write to file in binary mode 'wb'
        with open(output, 'wb') as f:
            f.write(file_content)
            return 0
    except Exception as e:
        print(e)
        return None
def dat_to_dataframe(filename):
    """Eliminate the empty spaces of each line and import the data as a dataframe
       ____________
       Args (str): the name of the file  
       ____________
       Returns (str): returns a dataframe containing the data in the file
    """
    file = open(filename, "r")
    file_lines = file.readlines()
    new = []
    for i in range(len(file_lines)):
        temp_line = re.sub("\s+", ",", file_lines[i].strip())
        values = temp_line.split(',')
        #according to the documentation, the lines beyond the 27-th are entries of a matrix, which we will not use
        new.append(values[0:27])
    file.close()
    # data taken from the Read me file 
    names = ['HIP', 'Sn', 'So', 'Nc', 'RArad', 'DErad', 'Plx', 'pmRA', 'pmDE', 'e_RArad', \
    'e_DErad', 'e_Plx', 'e_pmRA', 'e_pmDE', 'Ntr', 'F2', 'F1', 'var', 'ic', 'Vmag', \
    'e_Hpmag', 'sHp', 'VA', 'B-V', 'e_B-V', 'V-I', 'UW']
    return  pd.DataFrame(new, columns = names)
def preprocessing():
    """bundle togheter the preprocessing steps
       ____________
       Args (str): None
       ____________
       Returns (str): a dataframe of processed data 
    """
    # main file download
    gz_data_extraction('https://cdsarc.cds.unistra.fr/ftp/I/311/hip2.dat.gz', 'hip2.dat')
    print('gz downloaded from the web')
    # read me download 
    web_download('https://cdsarc.cds.unistra.fr/ftp/I/311/ReadMe', "ReadMe_hipparcos_2")
    print('Read me downloaded from the web')
    df = dat_to_dataframe("./hip2.dat")
    print('data transformed to dataframe')
    df = df[['B-V','Vmag', 'Plx', 'RArad', 'DErad']] 
    df = df.astype(float)
    print('B-V','Vmag', 'Plx', 'RArad', 'DErad features selected')
    temp_size = df.shape[0]
    df = df[abs(df['Plx']) >= 0.0001] 
    per_of_del_data = (1 - df.shape[0]/temp_size)*100
    print("Far away objects were deleted (Plx approx 0). A total of {var}% of the data was lost due this procedure.".format(var=per_of_del_data))
    df = df.reset_index()
    return df
#-----------------------------------------------------------------------

To check the data processing, let me define some easy to use data visualization functions to check the sanity of the data

#-----------------------------------------------------------------------
### plot histograms 
def plot_histograms(df, variables, n_rows, n_cols) : 
    """Plot histograms of the data
       ____________
       Args (str): the dataframe (df), the variables to plot (variables) the number of rows
       and columns to plot (n_rows, n_columns)
       ____________
       Returns (str): None
    """
    fig=plt.figure()
    for i, var_name in enumerate(variables):
        ax=fig.add_subplot(n_rows,n_cols,i+1)
        df[var_name].hist(bins=80,ax=ax, color='black')
        ax.set_title(var_name+" Distribution")
    fig.tight_layout()  # Improves appearance a bit.
    plt.show()    
    return None
def print_NaNs(df) : 
    """Plot histograms of the relative number of NaNs. The plot presents a horizintal line 
    at the heigth 5% so that it is ok the delete the data if the bar is below this line
       ____________
       Args: the dataframe (df)
       ____________
       Returns : None
    """
    NaSum = df.isna().mean().values  
    p = sns.barplot(y=NaSum, x=df.columns, color='black')
    p.axhline(y=0.05, linestyle = '--', color = 'red' )
    plt.xticks(rotation=90)
    plt.show()
    return None
#-----------------------------------------------------------------------

Now, functions to calculate the features that can be taken from the data alone. Also is present a function to plot the HR diagram, a function to interpolate the typical region of each star type and a function to get the coefficients of a polynomial interpolation to those regions.

#-----------------------------------------------------------------------
### Add calculable properties
def add_distance(df) : 
    """Calculate the distances from the object using the formula 
    D = 1/plx * 1000 * 3,26156 (in ly) and include 
    in the dataframe
       ____________
       Args: the dataframe (df)
       ____________
       Returns: the dataframe with the additinal column
    """
    df['Distance'] = abs(3.26156*1000/np.array(df["Plx"], dtype=float))
    return df 
def add_cartesian_coordinates(df) :
    """Calculate the cartesian coordinates of the objects for posterior plotting
       ____________
       Args: the dataframe (df)
       ____________
       Returns: the dataframe with three additinal columns (the x,y,z coordinates)
    """
    r = np.array(df['Distance'])
    cos_theta =  np.array(np.cos(df['DErad']))
    sin_theta =  np.array(np.sin(df['DErad']))
    cos_phi = np.array(np.cos((df['RArad'])))
    sin_phi = np.array(np.sin(df['RArad']))
    df['x_pos'] = r * cos_theta * sin_phi
    df['y_pos'] = r * sin_theta * sin_phi
    df['z_pos'] = r * cos_phi
    return df
def add_abs_magnitude(df) :
    """Calculate the absolute magnitude using the formula  M = m + 5 - 5LOG_10(d(pc))
    notice the distance in parsecs
       ____________
       Args: the dataframe (df)
       ____________
       Returns: the dataframe with the additinal column, the absolute magnitude
    """
    df['AbsMag'] = np.array(df["Vmag"]) + 2.43287314 -5*np.log10(np.array(df["Distance"])/3.26156)
    return df
#-------------------------------------------------------------------------
### Plot the HR-Diagram
def HR_diagram(Magnitude_B_V, Magnitude_absoluta) :
    """Plot the HR-diagram 
       ____________
       Args: the B_V magnitude feature and the absolute magnitude feature
       ____________
       Returns: None
    """
    data = pd.DataFrame({'X': Magnitude_B_V, 'Y': Magnitude_absoluta})
    F1 = {"family": "serif", "weight": "bold", "color": "black", "size": 18}
    F2 = {"family": "serif", "weight": "bold", "color": "black", "size": 20}
    norm = mpl.colors.Normalize(vmin=Magnitude_B_V.min(), vmax=Magnitude_B_V.max())
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.scatter(data['X'], data['Y'], c=data['X'],  cmap = "YlOrRd",\
        marker = '.', s = 2, linewidth = 0.0001, norm=norm)
    plt.xlabel("Color Index (B-V)", fontdict = F1)
    plt.ylabel("Absolute Magnitude (Mv)", fontdict = F1)
    plt.title("Hertzsprung-Russell diagram", fontdict = F2)
    plt.xlim(Magnitude_B_V.min(), Magnitude_B_V.max())
    plt.ylim(Magnitude_absoluta.max(), Magnitude_absoluta.min()) 
    ax.xaxis.set_minor_locator(AutoMinorLocator())
    ax.yaxis.set_minor_locator(AutoMinorLocator())
    Color_BC = plt.gca()
    Color_BC.set_facecolor("darkblue")
    plt.tight_layout()
    plt.show()
def HR_diagram_regions(sc_x, sc_y) :
    """Plot the HR-diagram, plot the limits of the regions considered to each star type 
    in the HR diagram, plot a polynomial regression  of each subset considering the limits
    and return those regressions cefficients
       ____________
       Args: The B_V magnitude and the Absolute magnitude (the same used to plot the HR diagram)
       ____________
       Returns: 7 arrays, each containing the parameters of the polynomial coefficients to fit
    the HR diagram and define the Star type regions
    """
    ### plot the points
    data = pd.DataFrame({'X': sc_x, 'Y': sc_y})
    F1 = {"family": "serif", "weight": "bold", "color": "black", "size": 18}
    F2 = {"family": "serif", "weight": "bold", "color": "black", "size": 20}
    norm = mpl.colors.Normalize(vmin=sc_x.min(), vmax=sc_x.max())
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.scatter(data['X'], data['Y'], c=data['X'],  cmap = "YlOrRd",\
        marker = '.', s = 2, linewidth = 0.0001, norm=norm)
    plt.xlabel("Color Index (B-V)", fontdict = F1)
    plt.ylabel("Absolute Magnitude (Mv)", fontdict = F1)
    plt.title("Hertzsprung-Russell diagram", fontdict = F2)
    plt.xlim(sc_x.min(), sc_x.max())
    plt.ylim(sc_y.max(), sc_y.min()) 
    ax.xaxis.set_minor_locator(AutoMinorLocator())
    ax.yaxis.set_minor_locator(AutoMinorLocator())
    Color_BC = plt.gca()
    Color_BC.set_facecolor("darkblue")
    plt.tight_layout()
    # useful polynomials 
    def func_2(x, a_0, a_1, a_2):
        return a_0 + a_1*x + a_2*x**2
    def func_3(x, a_0, a_1, a_2, a_3):
        return a_0 + a_1*x + a_2*x**2 + a_3*x**3
    def func_4(x, a_0, a_1, a_2, a_3, a_4):
        return a_0 + a_1*x + a_2*x**2 + a_3*x**3 + a_4*x**4
    def func_10(x, a_0, a_1, a_2, a_3, a_4, a_5, a_6, a_7, a_8, a_9, a_10):
        return a_0 + a_1*x + a_2*x**2 + a_3*x**3 + a_4*x**4 + a_5*x**5 + a_6*x**6 +\
            a_7*x**7 + a_8*x**8+ a_9*x**9 + a_10*x**10
    ### fitting degree 2 polynomial on the white dwarfs region
    f_test = lambda x: 5*x + 6
    test = pd.DataFrame({'X' : np.linspace(-0.4, 0.8, num = 50)})
    test['Y'] = test['X'].map(f_test)
    #sns.lineplot(x=test['X'], y=test['Y'],color='red', ax=ax)  
    new_df=pd.DataFrame({'x_0': sc_x, 'y_0':sc_y})
    new_df['limit'] = new_df['x_0'].map(f_test)
    new_df = new_df[new_df['y_0']>new_df['limit']]
    #sns.scatterplot(x=new_df['x_0'], y=new_df['y_0'], marker='*', color='gold')
    popt, pcov = curve_fit(func_2, new_df['x_0'], new_df['y_0'])
    test['Z'] = test['X'].map(lambda x: popt[0] + popt[1]*x + popt[2]*x**2)
    sns.lineplot(x=test['X'], y=test['Z'],color='white', ax=ax)  
    popt_wd  = popt.copy()
    ### fitting degree 4 polynomial on the brown dwarfs region
    test = pd.DataFrame({'X' : np.linspace(1.35, 2.5, num = 50)})
    test['Y_0'] = test['X'].map(lambda x: 15*x - 16.5)
    test['Y_1'] = test['X'].map(lambda x: 4.5*x - 4)
    #sns.lineplot(x=test['X'], y=test['Y_0'],color='red', ax=ax)  
    #sns.lineplot(x=test['X'], y=test['Y_1'],color='red', ax=ax)  
    new_df=pd.DataFrame({'x_0': sc_x, 'y_0':sc_y})
    new_df['limit_0'] = new_df['x_0'].map(lambda x: 15*x - 16.5)
    new_df['limit_1'] = new_df['x_0'].map(lambda x: 4.5*x - 4)
    new_df = new_df[new_df['y_0']<new_df['limit_0']]
    new_df = new_df[new_df['y_0']>new_df['limit_1']]
    new_df = new_df[new_df['x_0']>1.35]
    #sns.scatterplot(x=new_df['x_0'], y=new_df['y_0'], marker='*', color='gold')
    #sns.regplot(x=new_df['x_0'], y=new_df['y_0'], order = 4, ax= ax)
    popt, pcov = curve_fit(func_4, new_df['x_0'], new_df['y_0'])
    test['Z'] = test['X'].map(lambda x:  popt[4]*x**4 + popt[3]*x**3 + popt[2]*x**2 + popt[1]*x + popt[0])
    test = test[(test['X'] > 1.35)]
    test = test[(test['X'] < 2.5)]
    sns.lineplot(x=test['X'], y=test['Z'],color='white', ax=ax)  
    popt_bd  = popt.copy()
    # fitting the subdwarf branch with 3th degree polynomial
    test = pd.DataFrame({'X' : np.linspace(-0.4, 3.5, num = 50)})
    test['Y_0'] = test['X'].map(lambda x: 5*x + 6)
    test['Y_1'] = test['X'].map(lambda x: 6.5*x )
    #sns.lineplot(x=test['X'], y=test['Y_0'],color='red', ax=ax) 
    #sns.lineplot(x=test['X'], y=test['Y_1'],color='red', ax=ax) 
    new_df=pd.DataFrame({'x_0': sc_x, 'y_0':sc_y})
    new_df['limit_0'] = new_df['x_0'].map(lambda x: 5*x + 6)
    new_df['limit_1'] = new_df['x_0'].map(lambda x: 6.5*x)
    new_df = new_df[new_df['y_0']<new_df['limit_0']]
    new_df = new_df[new_df['y_0']>new_df['limit_1']]
    #sns.scatterplot(x=new_df['x_0'], y=new_df['y_0'], marker='*', color='gold')
    #sns.regplot(x=new_df['x_0'], y=new_df['y_0'], order = 3, ax= ax)
    popt, pcov = curve_fit(func_3, new_df['x_0'], new_df['y_0'])
    test['Z'] = test['X'].map(lambda x:  popt[3]*x**3 + popt[2]*x**2 + popt[1]*x + popt[0])
    test = test[(test['X'] < 1.4)]
    sns.lineplot(x=test['X'], y=test['Z'],color='white', ax=ax)  
    popt_sd  = popt.copy()
    # fitting the Main sequence stars to a 10-th order polynomial
    test = pd.DataFrame({'X' : np.linspace(-0.4, 3.5, num = 50)})
    test['Y_0'] = test['X'].map(lambda x: 6.5*x )
    test['Y_1'] = test['X'].map(lambda x: 6.5*x - 4 )
    #sns.lineplot(x=test['X'], y=test['Y_0'],color='red', ax=ax)  
    #sns.lineplot(x=test['X'], y=test['Y_1'],color='red', ax=ax) 
    new_df=pd.DataFrame({'x_0': sc_x, 'y_0':sc_y})
    new_df['limit_0'] = new_df['x_0'].map(lambda x: 6.5*x)
    new_df['limit_1'] = new_df['x_0'].map(lambda x: 6.5*x - 4)
    new_df = new_df[new_df['y_0']<new_df['limit_0']]
    new_df = new_df[new_df['y_0']>new_df['limit_1']]
    new_df = new_df[new_df['x_0']>-0.3]
    #sns.scatterplot(x=new_df['x_0'], y=new_df['y_0'], marker='*', color='gold')
    #sns.regplot(x=new_df['x_0'], y=new_df['y_0'], line_kws={'color':'black'}, scatter_kws={'s':1}, order=10)
    popt, pcov = curve_fit(func_10, new_df['x_0'], new_df['y_0'])
    test['Z'] = test['X'].map(lambda x: popt[10]*x**10 + popt[9]*x**9 + popt[8]*x**8 + popt[7]*x**7 + \
        popt[6]*x**6 + popt[5]*x**5 + popt[4]*x**4 + popt[3]*x**3 + \
        popt[2]*x**2 + popt[1]*x + popt[0])
    test = test[test['X']>-0.3]
    test = test[test['X']<1.8]
    sns.lineplot(x=test['X'], y=test['Z'],color='white', ax=ax)     
    popt_ps  = popt.copy()
    # fitting the subgiant branch with 2th degree polynomial
    test = pd.DataFrame({'X' : np.linspace(0.36, 2, num = 50)})
    test['Y_0'] = test['X'].map(lambda x: -5*(x-1.3)**2 + 3)
    test['Y_1'] = test['X'].map(lambda x: x - 2 )
    #sns.lineplot(x=test['X'], y=test['Y_0'],color='red', ax=ax) 
    #sns.lineplot(x=test['X'], y=test['Y_1'],color='red', ax=ax)
    new_df=pd.DataFrame({'x_0': sc_x, 'y_0':sc_y})
    new_df['limit_0'] = new_df['x_0'].map(lambda x: -5*(x-1.3)**2 + 3)
    new_df['limit_1'] = new_df['x_0'].map(lambda x: x - 2 )
    new_df = new_df[new_df['y_0']<new_df['limit_0']]
    new_df = new_df[new_df['y_0']>new_df['limit_1']]
    #sns.scatterplot(x=new_df['x_0'], y=new_df['y_0'], marker='*', color='gold')
    #sns.regplot(x=new_df['x_0'], y=new_df['y_0'], line_kws={'color':'black'}, scatter_kws={'s':1}, order=2)
    popt, pcov = curve_fit(func_2, new_df['x_0'], new_df['y_0'])
    test['Z'] = test['X'].map(lambda x: popt[2]*x**2 + popt[1]*x + popt[0])
    sns.lineplot(x=test['X'], y=test['Z'],color='white', ax=ax)
    popt_sG  = popt.copy()
    # fitting the giant branch with 4th degree polynomial
    test = pd.DataFrame({'X' : np.linspace(-0.4, 3.2, num = 50)})
    test['Y_0'] = test['X'].map(lambda x: 0.5*x - 6)
    test['Y_1'] = test['X'].map(lambda x: 0.5*x - 3)
    new_df=pd.DataFrame({'x_0': sc_x, 'y_0':sc_y})
    new_df['limit_0'] = new_df['x_0'].map(lambda x: 0.5*x - 6)
    new_df['limit_1'] = new_df['x_0'].map(lambda x: 0.5*x - 3)
    new_df = new_df[new_df['y_0']>new_df['limit_0']]
    new_df = new_df[new_df['y_0']<new_df['limit_1']]
    #sns.scatterplot(x=new_df['x_0'], y=new_df['y_0'], c='red', marker = '*', ax=ax) 
    #sns.lineplot(x=test['X'], y=test['Y_0'],color='red', ax=ax) 
    #sns.lineplot(x=test['X'], y=test['Y_1'],color='red', ax=ax) 
    #sns.regplot(x=new_df['x_0'], y=new_df['y_0'], order = 4)
    #sns.lineplot(x=test['X'], y=test['Y_2'],color='red', ax=ax)
    popt, pcov = curve_fit(func_4, new_df['x_0'], new_df['y_0'])
    test['Z'] = test['X'].map(lambda x: popt[4]*x**4 + popt[3]*x**3 + popt[2]*x**2 + popt[1]*x + popt[0]  )
    test = test[(test['X'] > 0.)]
    #test = test[(test['X'] < 1.9)]
    sns.lineplot(x=test['X'], y=test['Z'],color='white', ax=ax)
    popt_G  = popt.copy()
    # fitting a 2-th polynomial to the Bright Giant Ib branch 
    test = pd.DataFrame({'X' : np.linspace(-0.4, 3.5, num = 50)})
    test['Y_0'] = test['X'].map(lambda x: 0.5*x - 7.)
    test['Y_1'] = test['X'].map(lambda x: 0.5*x - 10.)
    #sns.lineplot(x=test['X'], y=test['Y_0'],color='red', ax=ax) 
    #sns.lineplot(x=test['X'], y=test['Y_1'],color='red', ax=ax) 
    #sns.scatterplot(x=new_df['x_0'], y=new_df['y_0'], c='red', marker = '*', ax=ax) 
    new_df=pd.DataFrame({'x_0': sc_x, 'y_0':sc_y})
    new_df['limit_0'] = new_df['x_0'].map(lambda x: 0.5*x - 7)
    new_df['limit_1'] = new_df['x_0'].map(lambda x: 0.5*x - 10)
    new_df = new_df[new_df['y_0']<new_df['limit_0']]
    new_df = new_df[new_df['y_0']>new_df['limit_1']]
    #sns.regplot(x=new_df['x_0'], y=new_df['y_0'], order = 4, line_kws={'color':'black'})
    popt, pcov = curve_fit(func_4, new_df['x_0'], new_df['y_0'])
    f_test = lambda x: popt[2]*x**2 + popt[1]*x + popt[0]  
    test['Z'] = test['X'].map(f_test)
    test = test[(test['X'] > -0.4)]
    test = test[(test['X'] < 3.0)]
    sns.lineplot(x=test['X'], y=test['Z'],color='white', ax=ax)
    popt_SGb  = popt.copy() 
    # fitting a 2-th polynomial to the Bright Giant Ia branch 
    test = pd.DataFrame({'X' : np.linspace(-0.4, 3.5, num = 50)})
    test['Y_0'] = test['X'].map(lambda x: 0.5*x - 10)
    #sns.lineplot(x=test['X'], y=test['Y_0'],color='red', ax=ax) 
    new_df=pd.DataFrame({'x_0': sc_x, 'y_0':sc_y})
    new_df['limit_0'] = new_df['x_0'].map(lambda x: 0.5*x - 10)
    new_df = new_df[new_df['y_0']<new_df['limit_0']]
    #sns.lineplot(x=test['X'], y=test['Y_1'],color='red', ax=ax) 
    #sns.scatterplot(x=new_df['x_0'], y=new_df['y_0'], c='red', marker = '*', ax=ax) 
    #sns.regplot(x=new_df['x_0'], y=new_df['y_0'], order = 4) 
    popt, pcov = curve_fit(func_4, new_df['x_0'], new_df['y_0']) 
    test['Z'] = test['X'].map(lambda x: popt[4]*x**4 + popt[3]*x**3 + popt[2]*x**2 + popt[1]*x + popt[0] )
    test = test[(test['X'] > -0.4)]
    test = test[(test['X'] < 3.5)]
    sns.lineplot(x=test['X'], y=test['Z'],color='white', ax=ax)
    popt_SGa  = popt.copy()
    plt.show()
    return popt_wd, popt_ps, popt_bd, popt_sd, popt_sG, popt_G, popt_SGb, popt_SGa
def find_HR_diagram(sc_x, sc_y) :
    """Based on the plot in the HR_diagram_regions(sc_x, sc_y) function, find the coefficients 
    of the fitted polynomials without ploting anything
       ____________
       Args: The B_V magnitude and the Absolute magnitude (the same used to plot the HR diagram)
       ____________
       Returns: 7 arrays, each containing the parameters of the polynomial coefficients to fit
    the HR diagram and define the Star type regions
    """
    # useful polynomials 
    def func_2(x, a_0, a_1, a_2):
        return a_0 + a_1*x + a_2*x**2
    def func_3(x, a_0, a_1, a_2, a_3):
        return a_0 + a_1*x + a_2*x**2 + a_3*x**3
    def func_4(x, a_0, a_1, a_2, a_3, a_4):
        return a_0 + a_1*x + a_2*x**2 + a_3*x**3 + a_4*x**4
    def func_10(x, a_0, a_1, a_2, a_3, a_4, a_5, a_6, a_7, a_8, a_9, a_10):
        return a_0 + a_1*x + a_2*x**2 + a_3*x**3 + a_4*x**4 + a_5*x**5 + a_6*x**6 +\
            a_7*x**7 + a_8*x**8+ a_9*x**9 + a_10*x**10
    ### fitting degree 2 polynomial on the white dwarfs region 8, 5, 0
    f_test = lambda x: 5*x + 6
    new_df=pd.DataFrame({'x_0': sc_x, 'y_0':sc_y})
    new_df['limit'] = new_df['x_0'].map(f_test)
    new_df = new_df[new_df['y_0']>new_df['limit']]
    popt, pcov = curve_fit(func_2, new_df['x_0'], new_df['y_0'])
    popt_wd  = popt.copy()
    ### fitting degree 4 polynomial on the brown dwarfs region
    new_df=pd.DataFrame({'x_0': sc_x, 'y_0':sc_y})
    new_df['limit_0'] = new_df['x_0'].map(lambda x: 15*x - 16.5)
    new_df['limit_1'] = new_df['x_0'].map(lambda x: 4.5*x - 4)
    new_df = new_df[new_df['y_0']<new_df['limit_0']]
    new_df = new_df[new_df['y_0']>new_df['limit_1']]
    new_df = new_df[new_df['x_0']>1.35]
    popt, pcov = curve_fit(func_4, new_df['x_0'], new_df['y_0'])
    popt_bd  = popt.copy()
    # fitting the subdwarf branch with 3th degree polynomial
    new_df=pd.DataFrame({'x_0': sc_x, 'y_0':sc_y})
    new_df['limit_0'] = new_df['x_0'].map(lambda x: 5*x + 6)
    new_df['limit_1'] = new_df['x_0'].map(lambda x: 6.5*x)
    new_df = new_df[new_df['y_0']<new_df['limit_0']]
    new_df = new_df[new_df['y_0']>new_df['limit_1']]
    popt, pcov = curve_fit(func_3, new_df['x_0'], new_df['y_0'])
    popt_sd  = popt.copy()
    # fitting the Main sequence stars to a 10-th order polynomial
    new_df=pd.DataFrame({'x_0': sc_x, 'y_0':sc_y})
    new_df['limit_0'] = new_df['x_0'].map(lambda x: 6.5*x)
    new_df['limit_1'] = new_df['x_0'].map(lambda x: 6.5*x - 4)
    new_df = new_df[new_df['y_0']<new_df['limit_0']]
    new_df = new_df[new_df['y_0']>new_df['limit_1']]
    new_df = new_df[new_df['x_0']>-0.3]
    new_df = new_df[new_df['x_0']<1.8]
    popt, pcov = curve_fit(func_10, new_df['x_0'], new_df['y_0'])
    popt_ps  = popt.copy()
    # fitting the subgiant branch with 2th degree polynomial
    new_df=pd.DataFrame({'x_0': sc_x, 'y_0':sc_y})
    new_df['limit_0'] = new_df['x_0'].map(lambda x: -5*(x-1.3)**2 + 3)
    new_df['limit_1'] = new_df['x_0'].map(lambda x: x - 2 )
    new_df = new_df[new_df['y_0']<new_df['limit_0']]
    new_df = new_df[new_df['y_0']>new_df['limit_1']]
    popt, pcov = curve_fit(func_2, new_df['x_0'], new_df['y_0'])
    popt_sG  = popt.copy()
    # fitting the giant branch with 4th degree polynomial
    new_df=pd.DataFrame({'x_0': sc_x, 'y_0':sc_y})
    new_df['limit_0'] = new_df['x_0'].map(lambda x: 0.5*x - 6)
    new_df['limit_1'] = new_df['x_0'].map(lambda x: 0.5*x - 3)
    new_df = new_df[new_df['y_0']>new_df['limit_0']]
    new_df = new_df[new_df['y_0']<new_df['limit_1']]
    new_df = new_df[new_df['x_0']>-0.05]
    new_df = new_df[new_df['x_0']<3.5]
    popt, pcov = curve_fit(func_4, new_df['x_0'], new_df['y_0'])
    popt_G  = popt.copy()
    # fitting a 2-th polynomial to the Bright Giant Ib branch 
    new_df=pd.DataFrame({'x_0': sc_x, 'y_0':sc_y})
    new_df['limit_0'] = new_df['x_0'].map(lambda x: 0.5*x - 7)
    new_df['limit_1'] = new_df['x_0'].map(lambda x: 0.5*x - 10)
    new_df = new_df[new_df['y_0']<new_df['limit_0']]
    new_df = new_df[new_df['y_0']>new_df['limit_1']]
    popt, pcov = curve_fit(func_4, new_df['x_0'], new_df['y_0'])
    popt_SGb  = popt.copy()
    # fitting a 2-th polynomial to the Bright Giant Ia branch 
    new_df=pd.DataFrame({'x_0': sc_x, 'y_0':sc_y})
    new_df['limit_0'] = new_df['x_0'].map(lambda x: 0.5*x - 10)
    new_df = new_df[new_df['y_0']<new_df['limit_0']]
    popt, pcov = curve_fit(func_4, new_df['x_0'], new_df['y_0']) 
    popt_SGa  = popt.copy()
    return popt_wd, popt_bd, popt_sd, popt_ps, popt_sG, popt_G, popt_SGb, popt_SGa
#--------------------------------------------------------------------------

Now, add the features that we estimate through physical arguments and the star type approximated by the distance the obct is from the characteristic regions in the HR diagram

#--------------------------------------------------------------------------
### Spectral type measurements, beggining of the estimated features
def add_luminosity(df) : 
    """Estimate the luminosity using L/Ls = 2.511886432**(Ms-M), L1_sol = 1, M1_sol = 4.83
       ____________
       Args: the dataframe (df)
       ____________
       Returns: the dataframe with the additinal column, the absolute magnitude
    """
    df['EstimatedLuminosity'] = 2.511886432**(4.83-np.array(df['AbsMag']))
    return df
def add_mass_estimative(df) : 
    """Estimate the mass using mass-luminosity estimatives like 
    class 1 : M < 0.43 M_sun -> L/Ls = 0.23(M/Ms)**2.3
    class 2 : 2 M_sun > M > 0.43 M_sun
    class 3 : 55 M_sun > M > 2 M_sun
    class 4 : M > 55 M_sun
       ____________
        Args: the dataframe (df)
       ____________
        Returns: the dataframe with the additinal column, the absolute magnitude
    """
    def mass_class_1(L) : 
        # L < 0.033
        # L/L_sun = 0.23(M/M_sun)**2.3
        mass_estimative = (4.347826087 * L)**(0.434782609)
        return mass_estimative
    def mass_class_2(L) : 
        #  16 > L > 0.033
        # L/L_sun = (M/M_sun)**4
        mass_estimative = L**(0.25)
        return mass_estimative
    def mass_class_3(L) : 
        # 1 727 418 > L > 16
        # L/L_sun = 1.4 (M/M_sun)**3.5
        mass_estimative = (0.714285714 * L)**(0.285714286)
        return mass_estimative
    def mass_class_4(L) : 
        # L > 1 727 418
        # L/L_sun = 32000 (M/M_sun)
        mass_estimative = 0.00003125 * L
        return mass_estimative
    mass_array = [] 
    for L in df['EstimatedLuminosity'] : 
        if L < 0.033 :
            mass_array.append(mass_class_1(L))
        elif L < 16 :
            mass_array.append(mass_class_2(L))   
        elif L < 1727418 :
            mass_array.append(mass_class_3(L))
        else :
            mass_array.append(mass_class_4(L))  
    df['EstimatedMass'] = mass_array
    return df
def add_radius_estimative(df) :
    """Estimate the radius of the object. For now, assuming that is a main sequence star
       ____________
       Args: the dataframe (df)
       ____________
       Returns: the dataframe with the additinal column, the estimated radius 
    """
    def main_sequence(M):
        if M < 1 : 
            R = M**0.57
        else :
            R = M**0.8
        return R    
    radius_estimates = []
    for M in df['EstimatedMass']: 
        radius_estimates.append(main_sequence(M))
    df['EstimatedRadius'] = radius_estimates
    return df
def add_effective_temperature_estimative(df) :
    """Estimate the effective temperatue using the stefan boltzman law: L = 4 pi R**2 cB T**4
       ____________
       Args: the dataframe (df)
       ____________
       Returns: the dataframe with the additinal column, the estimated effective temperatue
    """
    R_sun = 696340
    cB = 0.0567
    df['EstimatedTemperature'] = (np.array(df['EstimatedLuminosity']*3.846 * 10**26) / (4 * math.pi * (R_sun * np.array(df['EstimatedRadius']))**2 * cB))**(0.25)
    return df
def add_star_type_estimation(df) : 
    """Estimate the star type by its position in the HR diagram
       ____________
       Args: the dataframe (df)
       ____________
       Returns: the dataframe with the additinal column, the estimative of the star type
    """
    #popt_wd, popt_bd, popt_sd, popt_ps, popt_sG, popt_G, popt_SGb, popt_SGa = figure_test(temp['B-V'], temp['AbsMag'])
    popt_wd, popt_bd, popt_sd, popt_ps, popt_sG, popt_G, popt_SGb, popt_SGa =\
        find_HR_diagram(df['B-V'], df['AbsMag'])
    test = pd.DataFrame({'X' : np.linspace(df['B-V'].min(), df['B-V'].max(), num = 100)})
    test['wd_curve']= test['X'].map(lambda x: poly(x, vec=popt_wd))
    test['bd_curve']= test['X'].map(lambda x: poly(x, vec=popt_bd))
    test['swd_curve']= test['X'].map(lambda x: poly(x, vec=popt_sd))
    test['PS_curve']= test['X'].map(lambda x: poly(x, vec=popt_ps))
    test['sG_curve']= test['X'].map(lambda x: poly(x, vec=popt_sG))
    test['G_curve']= test['X'].map(lambda x: poly(x, vec=popt_G))
    test['SGb_curve']= test['X'].map(lambda x: poly(x, vec=popt_SGb))
    test['SGa_curve']= test['X'].map(lambda x: poly(x, vec=popt_SGa))
    wd_limits = (-0.4, 0.8, popt_wd)
    bd_limits = (1.4, 2.5, popt_bd) 
    swd_limits = (-0.4, 1.4, popt_sd)
    PS_limits = (-0.3, 1.8, popt_ps)
    sG_limits = (0.4, 2, popt_sG)
    G_limits = (0., 3.5, popt_G)
    SGb_limits = (-0.4, 3.5, popt_SGb) 
    SGa_limits = (-0.4, 3.5, popt_SGa)
    limits = [wd_limits, bd_limits, swd_limits, PS_limits, sG_limits, G_limits, SGa_limits, SGb_limits]
    p_wd, p_bd, p_sd, p_PS, p_sG, p_G, p_SGb, p_SGa, t = [], [], [], [], [], [], [], [], []
    StarType = ['white_dwarf', 'brown_dwarf','sub_dwarf', 'MS', 'sub_giant',\
        'giant', 'super_giant_b', 'super_giant_a']
    for index in range(df.shape[0]):
        temp = weigths(df.loc[index, 'B-V'], df.loc[index,'AbsMag'], limits)
        p_wd.append(temp[0])
        p_bd.append(temp[1])
        p_sd.append(temp[2])
        p_PS.append(temp[3])
        p_sG.append(temp[4])
        p_G.append(temp[5])
        p_SGb.append(temp[6])
        p_SGa.append(temp[7])
        i = np.where(temp==temp.max())[0][0]
        t.append(StarType[i])
    temp = pd.DataFrame({'p_wd':p_wd, 'p_bd':p_bd, 'p_sd':p_sd, 'p_PS':p_PS, 'p_sG':p_sG,\
        'p_G':p_G, 'p_SGb':p_SGb, 'p_SGa':p_SGa, 'StarType':t})   
    df = pd.concat([df, temp], axis=1)
    return df   
def add_main_sequence_type(df) :
    """Estimate the main sequence type of a main sequence star based on the estimative of the 
    effective temperature
       ____________
       Args: the dataframe (df)
       ____________
       Returns: the dataframe with the StarType column changed to have main sequence star types
    """
    for index in range(df.shape[0]):
        if  df.loc[index, 'StarType'] == 'MS' :
            if df.loc[index, 'EstimatedTemperature'] >= 25000:
                df.loc[index, 'StarType'] = 'MS_O_type'
            elif df.loc[index, 'EstimatedTemperature'] >= 11000:
                df.loc[index, 'StarType'] = 'MS_B_type'
            elif df.loc[index, 'EstimatedTemperature'] >= 7500:
                df.loc[index, 'StarType'] = 'MS_A_type'
            elif df.loc[index, 'EstimatedTemperature'] >= 6000:
                df.loc[index, 'StarType'] = 'MS_F_type'
            elif df.loc[index, 'EstimatedTemperature'] >= 5000:
                df.loc[index, 'StarType'] = 'MS_G_type'
            elif df.loc[index, 'EstimatedTemperature'] >= 3500:
                df.loc[index, 'StarType'] = 'MS_K_type'
            else:     
                df.loc[index, 'StarType'] = 'MS_M_type'
    return df         
#-------------------------------------------------------------------------
### Add properties 
def add_properties(df) : 
    """bundle up the properties wich can be directely derived from the data, 
    such as the absolute magnitude and the distances, or estimatives made with 
    physical arguments, sucha as scaling relations 
       ____________
       Args: the dataframe (df)
       ____________
       Returns: a new dataframe with the new features appended
    """
    df = add_distance(df)
    print('Distances added')
    df = add_cartesian_coordinates(df)
    print('3-dimensional cartesian coordinates added')
    df = add_abs_magnitude(df)
    print('Absolute magnitude added')
    aux = df.shape[0]
    df = df[abs(df['B-V']) > 0.000001]
    print("More data was deleted due poin float errors. An amount of {var}% was deleted".format(var=round((1-df.shape[0]/aux)*100,4)))
    df = df.reset_index()
    df = add_luminosity(df)
    print('Luminosity estimative added')
    df = add_mass_estimative(df)
    print('Mass estimative added')
    df = add_star_type_estimation(df)
    print('Gross star classification estimative added')
    df = add_radius_estimative(df)
    print('Radius estimative added')
    df = add_effective_temperature_estimative(df)
    print('Effective temperatue estimative added')
    df = add_main_sequence_type(df)
    print('Main sequence stars classification estimative added')
    return df
#-------------------------------------------------------------------------

Plot the 3D-map and some plotting functions to make a preliminary analysis of the data

### 3d neighborhood map according to the data
def plot_map(df, lower_distance, upper_distance) : 
    """Plot a 3D-map (which is navigable using the right-left mouse buttons) with 
    color defined by star type and temperature, size define by the star radius and the 
    map centered in the sun
       ____________
       Args: the dataframe (df) and two distance limits, will be plotted all stars
    distant beteween the estipulated limits
       ____________
       Returns: None
    """
    df.sort_values(by='Distance', ascending=True, inplace=True, ignore_index=True)
    temp = df[df['Distance'] < upper_distance]
    temp = temp[temp['Distance'] > lower_distance]
    temp.reset_index(drop=True, inplace=True)
    CMap = []
    Marker = []
    all_bool = []
    for index in range(temp.shape[0]): 
        if temp.loc[index, 'StarType'] == 'white_dwarf':
            CMap.append('lightgray')
            Marker.append('v')
            all_bool.append(True)
        elif temp.loc[index, 'StarType'] == 'brown_dwarf':
            CMap.append('darkorange')
            Marker.append('s')
            all_bool.append(True)  
        elif temp.loc[index, 'StarType'] == 'sub_dwarf':
            CMap.append('cadetblue')
            Marker.append('p')
            all_bool.append(True)  
        elif temp.loc[index, 'StarType'] == 'MS_O_type':
            CMap.append('blue')
            Marker.append('o')
            all_bool.append(True)   
        elif temp.loc[index, 'StarType'] == 'MS_B_type':
            CMap.append('aqua')
            Marker.append('o')
            all_bool.append(True)       
        elif temp.loc[index, 'StarType'] == 'MS_A_type':
            CMap.append('lightseagreen')
            Marker.append('o')
            all_bool.append(True)    
        elif temp.loc[index, 'StarType'] == 'MS_F_type':
            CMap.append('yellow')
            Marker.append('o')
            all_bool.append(True)     
        elif temp.loc[index, 'StarType'] == 'MS_G_type':
            CMap.append('orange')
            Marker.append('o')
            all_bool.append(True)     
        elif temp.loc[index, 'StarType'] == 'MS_K_type':
            CMap.append('orangered')
            Marker.append('o')
            all_bool.append(True)    
        elif temp.loc[index, 'StarType'] == 'MS_M_type':
            CMap.append('red')
            Marker.append('o')
            all_bool.append(True)
        elif temp.loc[index, 'StarType'] == 'sub_giant':
            CMap.append('coral')
            Marker.append('D')
            all_bool.append(True)
        elif temp.loc[index, 'StarType'] == 'giant':
            CMap.append('crimson')
            Marker.append('X')
            all_bool.append(True)   
        elif temp.loc[index, 'StarType'] == 'super_giant_b':
            CMap.append('thistle')
            Marker.append('P')
            all_bool.append(True)      
        elif temp.loc[index, 'StarType'] == 'super_giant_a':
            CMap.append('lavenderblush')
            Marker.append('*')
            all_bool.append(True)
        else:
            all_bool.append(False)     
    temp = temp[all_bool]
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')
    ax.scatter(0.0, 0.0, 0.0, s=1, marker='*', c='white')
    ax.scatter(temp['x_pos'], temp['y_pos'], temp['z_pos'], s=temp['EstimatedRadius'], \
        alpha = 1, c=CMap, marker = 'o')
    ax.grid(False)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_zticks([])
    plt.axis('off')
    plt.grid(b=None)
    Color_BC = plt.gca()
    Color_BC.set_facecolor("black")
    plt.show()
def plot_bexenplots_degree_distance(df, feat_to_plot, inf_distance, sup_distance):
    """Plot boxenplots of a object feature in a distance interval
       ____________
       Args: the dataframe (df), the feature (to plot) name and two distance limits, will be plotted all stars
    distant between the estipulated limits
       ____________
       Returns: None
    """
    df.sort_values(by='Distance', ascending=True, inplace=True, ignore_index=True)
    temp = df[df['Distance'] <= sup_distance]
    temp = temp[temp['Distance'] >= inf_distance]
    fig, ax = plt.subplots(3,1, figsize=(16, 14))
    plt.subplots_adjust(wspace=.3, hspace=.3)
    g_0 = sns.boxenplot(data=temp, y=feat_to_plot, x = 'RArad', ax=ax[0], color = 'yellow')
    g_1 = sns.boxenplot(data=temp, y=feat_to_plot, x = 'DErad', ax=ax[1], color = 'orange')
    g_2 = sns.boxenplot(data=temp, y=feat_to_plot, x = 'Distance', ax=ax[2], color = 'violet')
    ax[0].set_xticks([-math.pi, -math.pi/2 ,0, math.pi/2, math.pi, 3*math.pi/2], ['180', '270','0', '90', '180', '270'])
    ax[1].set_xticks([-math.pi, -math.pi/2 ,0, math.pi/2, math.pi], ['180', '270','0', '90', '180'])
    ax[0].set_xlabel('Degrees')
    ax[1].set_xlabel('Degrees')
    ax[2].set_xlabel('Distance (ly)')
    plt.show()
    return None
def plot_lineplots_degree_distance(df, feat_to_plot, inf_distance, sup_distance):
    """Plot lineplots of a numerical feature in a distance interval
       ____________
       Args: the dataframe (df), the feature (to plot) name and two distance limits, will be plotted all stars
    distant between the estipulated limits
       ____________
       Returns: None
    """
    df.sort_values(by='Distance', ascending=True, inplace=True, ignore_index=True)
    temp = df[df['Distance'] <= sup_distance]
    temp = temp[temp['Distance'] >= inf_distance]
    fig, ax = plt.subplots(3,1, figsize=(16, 14))
    plt.subplots_adjust(wspace=.3, hspace=.3)
    g_0 = sns.lineplot(data=temp, y=feat_to_plot, x = 'RArad', ax=ax[0], color = 'yellow')
    g_1 = sns.lineplot(data=temp, y=feat_to_plot, x = 'DErad', ax=ax[1], color = 'orange')
    g_2 = sns.lineplot(data=temp, y=feat_to_plot, x = 'Distance', ax=ax[2], color = 'violet')
    ax[0].set_xticks([-math.pi, -math.pi/2 ,0, math.pi/2, math.pi, 3*math.pi/2], ['180', '270','0', '90', '180', '270'])
    ax[1].set_xticks([-math.pi, -math.pi/2 ,0, math.pi/2, math.pi], ['180', '270','0', '90', '180'])
    ax[0].set_xlabel('Degrees')
    ax[1].set_xlabel('Degrees')
    ax[2].set_xlabel('Distance (ly)')
    plt.show()
    return None
#-------------------------------------------------------------------------

‌
‌
‌