Prediction model: condo prices in Bangkok
import pandas as pd

station_df = pd.read_excel('station_df.xlsx')
station_df

Landmark_df = pd.read_excel('Landmark_df.xlsx')
Landmark_df

from math import radians, cos, sin, asin, sqrt
def haversine(geoLng1, geoLat1, geoLng2, geoLat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians
    geoLng1, geoLat1, geoLng2, geoLat2 = map(radians, [geoLng1, geoLat1, geoLng2, geoLat2])
    # haversine formula
    dLng = geoLng2 - geoLng1
    dLat = geoLat2 - geoLat1
    a = sin(dLat/2)**2 + cos(geoLat1) * cos(geoLat2) * sin(dLng/2)**2
    c = 2 * asin(sqrt(a))
    r = 6371  # radius of the earth in kilometers
    return c * r
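# Quick sanity check (added sketch, not in the original notebook): distance between
# the two test coordinates used further below; the result should be roughly 28 km.
haversine(100.7940923, 13.757760305707595, 100.5363750963248, 13.731800694225882)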
def find_nearest_stations(station_df, geoLat, geoLng, num_stations=3):
    """
    Find the top num_stations nearest train stations to the given latitude and longitude,
    with different line names from each other, along with their distances in kilometers.
    """
    station_data = station_df.copy()
    # calculate the distances from the input coordinates to all stations
    station_data['distance_km'] = station_data.apply(lambda row: haversine(geoLng, geoLat, row['geoLng'], row['geoLat']), axis=1)
    # replace stations that are more than 8 km away with None values
    station_data.loc[station_data['distance_km'] > 8, ['name', 'distance_km', 'lineName']] = [None, None, None]
    # sort the stations by distance
    nearest_stations = station_data.sort_values(by='distance_km')
    # create an empty list to store the output stations
    output_stations = []
    # loop through the nearest stations and add the top num_stations that are not on the same line as each other
    for _, station in nearest_stations.iterrows():
        if len(output_stations) >= num_stations:
            break
        if station['name'] is not None and all(station['lineName'] != s[2] for s in output_stations):
            output_stations.append((station['name'], station['distance_km'], station['lineName']))
    # pad missing stations with None values
    while len(output_stations) < num_stations:
        output_stations.append((None, None, None))
    return output_stations
def find_top_landmarks_for_points(landmark_df, geoLat, geoLng):
    """
    Find the nearest landmark of each type ('Hospital', 'Department store', 'University')
    to the given latitude and longitude, along with its distance in kilometers.
    """
    landmark_data = landmark_df.copy()
    num_top_landmarks = 1
    # calculate the distances from the input coordinates to all landmarks
    landmark_data['distance_km'] = landmark_data.apply(lambda row: haversine(geoLng, geoLat, row['geoLng'], row['geoLat']), axis=1)
    # replace landmarks that are more than 8 km away with None values
    landmark_data.loc[landmark_data['distance_km'] > 8, ['Full Address', 'distance_km', 'Type land mark']] = [None, None, None]
    # sort the landmarks by distance
    sort_landmarks = landmark_data.sort_values(by='distance_km')
    # collect the nearest num_top_landmarks landmarks of each type, in a fixed order
    output_landmarks = []
    for landmark_type in ('Hospital', 'Department store', 'University'):
        nearest_of_type = sort_landmarks.loc[sort_landmarks['Type land mark'] == landmark_type]
        if nearest_of_type.empty:
            output_landmarks.append((None, None, None))
        else:
            for _, landmark in nearest_of_type.head(num_top_landmarks).iterrows():
                output_landmarks.append((landmark['Full Address'], landmark['distance_km'], landmark['Type land mark']))
    return output_landmarks
#Test_function
nearest_landmarks = find_top_landmarks_for_points(Landmark_df,13.757760305707595, 100.7940923)
pd.DataFrame(nearest_landmarks)

#Test_function
nearest_stations = find_nearest_stations(station_df,13.731800694225882, 100.5363750963248)
pd.DataFrame(nearest_stations)
def add_near_station_top3(newdata, station_data):
    # find the nearest stations for every row using a list comprehension
    nearest_stations = [find_nearest_stations(station_data, row['geoLat'], row['geoLng'], num_stations=2) for _, row in newdata.iterrows()]
    # create a DataFrame from the output lists
    station_df = pd.DataFrame([{'station_1': station[0][0], 'station_2': station[1][0],
                                'distance_1': station[0][1], 'distance_2': station[1][1],
                                'line_1': station[0][2], 'line_2': station[1][2]}
                               for station in nearest_stations], index=newdata.index)
    # concatenate the new DataFrame with the original DataFrame and return
    return pd.concat([newdata, station_df], axis=1)
def add_near_landmark_top3(newdata, landmark_data):
    # find the nearest landmarks for every row using a list comprehension
    nearest_landmarks = [find_top_landmarks_for_points(landmark_data, row['geoLat'], row['geoLng']) for _, row in newdata.iterrows()]
    # create a DataFrame from the output lists
    landmark_df = pd.DataFrame([{'landmark_Hospital': landmark[0][0], 'landmark_Department store': landmark[1][0], 'landmark_University': landmark[2][0],
                                 'distance_Hospital': landmark[0][1], 'distance_Department store': landmark[1][1], 'distance_University': landmark[2][1]}
                                for landmark in nearest_landmarks], index=newdata.index)
    # concatenate the new DataFrame with the original DataFrame and return
    return pd.concat([newdata, landmark_df], axis=1)

# create a DataFrame with example data
TESTER = pd.DataFrame({
    'place': ['A', 'B', 'C', 'D', 'E'],
    'geoLat': [13.8353337, 13.7462463, 51.5033, 51.5113, 51.5077],
    'geoLng': [100.6661074, 100.5325515, -0.1195, -0.1199, -0.1226]
})
df = add_near_landmark_top3(TESTER,Landmark_df)
df = add_near_station_top3(df,station_df)
#df = find_top_landmarks_for_points(df,Landmark_df,('Hospital','Department store', 'University'))
df

df_proterty = pd.read_csv('ddproperty.csv')
df_proterty

print('sub_property_type', df_proterty['sub_property_type'].unique(), '\n')
print('property_type',df_proterty['property_type'].unique(),'\n')
print('state',df_proterty['state'].unique(),'\n')
print('land_space_unit',df_proterty['land_space_unit'].unique(),'\n')
print('currency',df_proterty['currency'].unique(),'\n')
print('country',df_proterty['country'].unique(),'\n')
print('property_status', df_proterty['property_status'].unique(), '\n')

# select the columns of interest; price_per_unit strings look like '฿84623.8938 / sqm'
#df_proterty.info()
import re
df_proterty['price_per_unit'] = df_proterty.apply(lambda row: float(re.findall(r"฿\s*(\d+(?:\.\d+)?)\s*/", row['price_per_unit'])[0]), axis=1)
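# Example (added sketch, not in the original notebook): the same pattern applied to a
# sample value in the format shown above; re.findall returns the numeric part as a
# string, which is then cast to float.
re.findall(r"฿\s*(\d+(?:\.\d+)?)\s*/", '฿84623.8938 / sqm')  # -> ['84623.8938']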
df = df_proterty[['sub_property_type','living_space','city','state','latitude','longitude','price_per_unit','property_type','built_year','last_updated']]
df = df.rename(columns={'latitude': 'geoLat', 'longitude': 'geoLng'})
# check for null values in df
df.isnull().sum()
df_without_na = df.dropna(subset=['geoLat', 'living_space', 'geoLng', 'built_year'])
df_without_na.info()
df_proterty
df_Bangkok_Metropolitan_Region = df_without_na.loc[df_without_na['state'].isin(['Bangkok', 'Samut Prakan', 'Nonthaburi', 'Pathum Thani', 'Samut Sakhon'])]

df_Bangkok_Metropolitan_Region.info()

"""df_Bangkok_Metropolitan_Region_withstation = add_near_station_top3(df_Bangkok_Metropolitan_Region, station_df)
df_Bangkok_Metropolitan_Region_withstation = add_near_landmark_top3(df_Bangkok_Metropolitan_Region_withstation, Landmark_df)
df_Bangkok_Metropolitan_Region_withstation['last_updated'] = pd.to_datetime(df_Bangkok_Metropolitan_Region_withstation['last_updated'])
"""