Prediction model: condo prices in Bangkok
import pandas as pd

station_df = pd.read_excel('station_df.xlsx')
station_df

Landmark_df = pd.read_excel('Landmark_df.xlsx')
Landmark_df

from math import radians, cos, sin, asin, sqrt
def haversine(geoLng1, geoLat1, geoLng2, geoLat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians
    geoLng1, geoLat1, geoLng2, geoLat2 = map(radians, [geoLng1, geoLat1, geoLng2, geoLat2])
    # haversine formula
    dLng = geoLng2 - geoLng1
    dLat = geoLat2 - geoLat1
    a = sin(dLat/2)**2 + cos(geoLat1) * cos(geoLat2) * sin(dLng/2)**2
    c = 2 * asin(sqrt(a))
    r = 6371  # radius of the earth in kilometers
    return c * r
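# Quick sanity check (added sketch, not in the original notebook): distance between
# the two test coordinates used further below; the result should be roughly 28 km.
haversine(100.7940923, 13.757760305707595, 100.5363750963248, 13.731800694225882)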
def find_nearest_stations(station_df, geoLat, geoLng, num_stations=3):
    """
    Find the top num_stations nearest train stations to the given latitude and longitude,
    with different line names from each other, along with their distances in kilometers.
    """
    station_data = station_df.copy()
    # calculate the distances from the input coordinates to all stations
    station_data['distance_km'] = station_data.apply(lambda row: haversine(geoLng, geoLat, row['geoLng'], row['geoLat']), axis=1)
    # replace stations that are more than 8 km away with None values
    station_data.loc[station_data['distance_km'] > 8, ['name', 'distance_km', 'lineName']] = [None, None, None]
    # sort the stations by distance
    nearest_stations = station_data.sort_values(by='distance_km')
    # create an empty list to store the output stations
    output_stations = []
    # loop through the nearest stations and add the top num_stations that are not on the same line as each other
    for _, station in nearest_stations.iterrows():
        if len(output_stations) >= num_stations:
            break
        if station['name'] is not None and all(station['lineName'] != s[2] for s in output_stations):
            output_stations.append((station['name'], station['distance_km'], station['lineName']))
    # pad missing stations with None values
    while len(output_stations) < num_stations:
        output_stations.append((None, None, None))
    return output_stations
def find_top_landmarks_for_points(landmark_df, geoLat, geoLng):
    """
    Find the nearest landmark of each type ('Hospital', 'Department store', 'University')
    to the given latitude and longitude, along with its distance in kilometers.
    """
    landmark_data = landmark_df.copy()
    num_top_landmarks = 1
    # calculate the distances from the input coordinates to all landmarks
    landmark_data['distance_km'] = landmark_data.apply(lambda row: haversine(geoLng, geoLat, row['geoLng'], row['geoLat']), axis=1)
    # replace landmarks that are more than 8 km away with None values
    landmark_data.loc[landmark_data['distance_km'] > 8, ['Full Address', 'distance_km', 'Type land mark']] = [None, None, None]
    # sort the landmarks by distance
    sort_landmarks = landmark_data.sort_values(by='distance_km')
    # collect the nearest num_top_landmarks landmarks of each type, in a fixed order
    output_landmarks = []
    for landmark_type in ('Hospital', 'Department store', 'University'):
        nearest_of_type = sort_landmarks.loc[sort_landmarks['Type land mark'] == landmark_type]
        if nearest_of_type.empty:
            output_landmarks.append((None, None, None))
        else:
            for _, landmark in nearest_of_type.head(num_top_landmarks).iterrows():
                output_landmarks.append((landmark['Full Address'], landmark['distance_km'], landmark['Type land mark']))
    return output_landmarks
#Test_function
nearest_landmarks = find_top_landmarks_for_points(Landmark_df,13.757760305707595, 100.7940923)
pd.DataFrame(nearest_landmarks)

#Test_function
nearest_stations = find_nearest_stations(station_df,13.731800694225882, 100.5363750963248)
pd.DataFrame(nearest_stations)
def add_near_station_top3(newdata, station_data):
    # find the nearest stations for every row using a list comprehension
    nearest_stations = [find_nearest_stations(station_data, row['geoLat'], row['geoLng'], num_stations=2) for _, row in newdata.iterrows()]
    # create a DataFrame from the output lists
    station_df = pd.DataFrame([{'station_1': station[0][0], 'station_2': station[1][0],
                                'distance_1': station[0][1], 'distance_2': station[1][1],
                                'line_1': station[0][2], 'line_2': station[1][2]}
                               for station in nearest_stations], index=newdata.index)
    # concatenate the new DataFrame with the original DataFrame and return
    return pd.concat([newdata, station_df], axis=1)
def add_near_landmark_top3(newdata, landmark_data):
    # find the nearest landmarks for every row using a list comprehension
    nearest_landmarks = [find_top_landmarks_for_points(landmark_data, row['geoLat'], row['geoLng']) for _, row in newdata.iterrows()]
    # create a DataFrame from the output lists
    landmark_df = pd.DataFrame([{'landmark_Hospital': landmark[0][0], 'landmark_Department store': landmark[1][0], 'landmark_University': landmark[2][0],
                                 'distance_Hospital': landmark[0][1], 'distance_Department store': landmark[1][1], 'distance_University': landmark[2][1]}
                                for landmark in nearest_landmarks], index=newdata.index)
    # concatenate the new DataFrame with the original DataFrame and return
    return pd.concat([newdata, landmark_df], axis=1)

# create a DataFrame with example data
TESTER = pd.DataFrame({
    'place': ['A', 'B', 'C', 'D', 'E'],
    'geoLat': [13.8353337, 13.7462463, 51.5033, 51.5113, 51.5077],
    'geoLng': [100.6661074, 100.5325515, -0.1195, -0.1199, -0.1226]
})
df = add_near_landmark_top3(TESTER,Landmark_df)
df = add_near_station_top3(df,station_df)
#df = find_top_landmarks_for_points(df,Landmark_df,('Hospital','Department store', 'University'))
df

df_proterty = pd.read_csv('ddproperty.csv')
df_proterty

print('sub_property_type', df_proterty['sub_property_type'].unique(), '\n')
print('property_type',df_proterty['property_type'].unique(),'\n')
print('state',df_proterty['state'].unique(),'\n')
print('land_space_unit',df_proterty['land_space_unit'].unique(),'\n')
print('currency',df_proterty['currency'].unique(),'\n')
print('country',df_proterty['country'].unique(),'\n')
print('property_status', df_proterty['property_status'].unique(), '\n')

# select the columns of interest; price_per_unit strings look like '฿84623.8938 / sqm'
#df_proterty.info()
import re
df_proterty['price_per_unit'] = df_proterty.apply(lambda row: float(re.findall(r"฿\s*(\d+(?:\.\d+)?)\s*/", row['price_per_unit'])[0]), axis=1)
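# Example (added sketch, not in the original notebook): the same pattern applied to a
# sample value in the format shown above; re.findall returns the numeric part as a
# string, which is then cast to float.
re.findall(r"฿\s*(\d+(?:\.\d+)?)\s*/", '฿84623.8938 / sqm')  # -> ['84623.8938']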
df = df_proterty[['sub_property_type','living_space','city','state','latitude','longitude','price_per_unit','property_type','built_year','last_updated']]
df = df.rename(columns={'latitude': 'geoLat', 'longitude': 'geoLng'})
# check for null values in df
df.isnull().sum()
df_without_na = df.dropna(subset=['geoLat', 'living_space', 'geoLng', 'built_year'])
df_without_na.info()
df_proterty
df_Bangkok_Metropolitan_Region = df_without_na.loc[df_without_na['state'].isin(['Bangkok', 'Samut Prakan', 'Nonthaburi', 'Pathum Thani', 'Samut Sakhon'])]

df_Bangkok_Metropolitan_Region.info()

"""df_Bangkok_Metropolitan_Region_withstation = add_near_station_top3(df_Bangkok_Metropolitan_Region, station_df)
df_Bangkok_Metropolitan_Region_withstation = add_near_landmark_top3(df_Bangkok_Metropolitan_Region_withstation, Landmark_df)
df_Bangkok_Metropolitan_Region_withstation['last_updated'] = pd.to_datetime(df_Bangkok_Metropolitan_Region_withstation['last_updated'])
"""