Skip to content
%%capture
!pip install gspread
import pandas as pd
import gspread
from google.oauth2 import service_account
import json
import os
import os
import json
import gspread
from google.oauth2 import service_account

google_json = os.getenv("GOOGLE_JSON")
service_account_info = json.loads(google_json)
credentials = service_account.Credentials.from_service_account_info(service_account_info)
scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
creds_with_scope = credentials.with_scopes(scope)
client = gspread.authorize(creds_with_scope)
spreadsheet = client.open_by_url('https://docs.google.com/spreadsheets/d/1RwmKWMFtBtXXiwNUve074jPdePWyrwZ-fRuAiIHE3Hc/edit#gid=1666100842')
worksheet = spreadsheet.get_worksheet(0)
records_data = worksheet.get_all_records()
records_df = pd.DataFrame.from_dict(records_data)
records_df

1.1 How many rows are in the dataset?

records_df.shape[0]

1.2 What fraction of trees are oak or elm? Elms and oaks are those trees where their Species Common Name, the "SPP_COM" column, contains either "elm" or "oak", ignoring case. Exclude from that analysis those trees that are in column Species Common Name listed as "other, unknown", "various", "stump", " " (white space), and "repl to be determined", once again, ignoring case.

contains_elm = records_df["SPP_COM"].str.lower().str.contains("elm")
contains_oak = records_df["SPP_COM"].str.lower().str.contains("oak")

filtered_elm_df = records_df[contains_elm]
filtered_oak_df = records_df[contains_oak]
elm_oak_df = filtered_elm_df.append(filtered_oak_df, ignore_index=True)
excluded_df = records_df[~(records_df["SPP_COM"].str.lower().str.contains('other, unknown')) & ~(records_df["SPP_COM"].str.lower().str.contains('various')) & ~(records_df["SPP_COM"].str.lower().str.contains('stump')) & ~(records_df["SPP_COM"].str.lower().str.contains('repl to be determined')) & ~(records_df["SPP_COM"].str.lower() == ' ')]
print(elm_oak_df.shape[0], " / ", excluded_df.shape[0])

1.3 How does growth space size vary across tree species? You'll notice the growth space size column, "GSSIZE", is really a range. E.g., "4' - 5'". For entries that are ranges, use the middle point of the lower and upper range, e.g., 4.5 feet for the previous example. If the entry only has a lower range, just use the value of the lower range. Ignore trees where the growth space size entry does not represent a number. For each species, using the Species Common Name, calculate the average growth space size. Report the standard deviation of these averages. Keep the measurement in feet. Exclude species with less than 50 trees from the analysis.

import pandas as pd

def extract_num(inp_str):
    num = []
    for c in inp_str:
        if c.isdigit():
            num.append(float(c)) 
    return num

gss_df = records_df[['SPP_COM', 'GSSIZE']]
gss_num_df = pd.DataFrame(columns = ['SPP_COM', 'GSSIZE', 'GSS_LOW', "GSS_HIGH", "GSS_AVE"])

for ind in gss_df.index:
    gss_numbers = extract_num(gss_df['GSSIZE'][ind]) 
    if len(gss_numbers) == 1:
        gss_num_df.loc[len(gss_num_df.index)] = [gss_df['SPP_COM'][ind], gss_df['GSSIZE'][ind], gss_numbers[0], "", gss_numbers[0]] 
       
    
    elif len(gss_numbers) == 2:
        gss_num_df.loc[len(gss_num_df.index)] = [gss_df['SPP_COM'][ind], gss_df['GSSIZE'][ind], gss_numbers[0], gss_numbers[1], (gss_numbers[0]+gss_numbers[1]) / 2] 
        
filtered = gss_num_df.groupby(['SPP_COM']).filter(lambda x: len(x) > 49)

mean = filtered.groupby(['SPP_COM'])['GSS_AVE'].mean()
stddev = filtered.groupby(['SPP_COM'])['GSS_AVE'].std()

print(mean)
print(stddev)

1.4 What is the median distance from each Norway Maple to its closest Red Maple? The longitude and latitude as decimal degrees for each tree is stored in the "X" and "Y" column, respectively. For calculating distances, please use the spherical Earth projected to a plane equation. Use the radius of the Earth as 6371 km. For conversion between kilometers and miles, use that one mile is equal to 1.60934 kilometers. Report your answer in terms of feet.