Skip to content
Data Incubator
%%capture
!pip install gspread
import pandas as pd
import gspread
from google.oauth2 import service_account
import json
import os
import os
import json
import gspread
from google.oauth2 import service_account
google_json = os.getenv("GOOGLE_JSON")
service_account_info = json.loads(google_json)
credentials = service_account.Credentials.from_service_account_info(service_account_info)
scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
creds_with_scope = credentials.with_scopes(scope)
client = gspread.authorize(creds_with_scope)
spreadsheet = client.open_by_url('https://docs.google.com/spreadsheets/d/1RwmKWMFtBtXXiwNUve074jPdePWyrwZ-fRuAiIHE3Hc/edit#gid=1666100842')
worksheet = spreadsheet.get_worksheet(0)
records_data = worksheet.get_all_records()
records_df = pd.DataFrame.from_dict(records_data)
records_df
1.1 How many rows are in the dataset?
records_df.shape[0]
1.2 What fraction of trees are oak or elm? Elms and oaks are those trees where their Species Common Name, the "SPP_COM" column, contains either "elm" or "oak", ignoring case. Exclude from that analysis those trees that are in column Species Common Name listed as "other, unknown", "various", "stump", " " (white space), and "repl to be determined", once again, ignoring case.
contains_elm = records_df["SPP_COM"].str.lower().str.contains("elm")
contains_oak = records_df["SPP_COM"].str.lower().str.contains("oak")
filtered_elm_df = records_df[contains_elm]
filtered_oak_df = records_df[contains_oak]
elm_oak_df = filtered_elm_df.append(filtered_oak_df, ignore_index=True)
excluded_df = records_df[~(records_df["SPP_COM"].str.lower().str.contains('other, unknown')) & ~(records_df["SPP_COM"].str.lower().str.contains('various')) & ~(records_df["SPP_COM"].str.lower().str.contains('stump')) & ~(records_df["SPP_COM"].str.lower().str.contains('repl to be determined')) & ~(records_df["SPP_COM"].str.lower() == ' ')]
print(elm_oak_df.shape[0], " / ", excluded_df.shape[0])
1.3 How does growth space size vary across tree species? You'll notice the growth space size column, "GSSIZE", is really a range. E.g., "4' - 5'". For entries that are ranges, use the middle point of the lower and upper range, e.g., 4.5 feet for the previous example. If the entry only has a lower range, just use the value of the lower range. Ignore trees where the growth space size entry does not represent a number. For each species, using the Species Common Name, calculate the average growth space size. Report the standard deviation of these averages. Keep the measurement in feet. Exclude species with less than 50 trees from the analysis.
import pandas as pd
def extract_num(inp_str):
num = []
for c in inp_str:
if c.isdigit():
num.append(float(c))
return num
gss_df = records_df[['SPP_COM', 'GSSIZE']]
gss_num_df = pd.DataFrame(columns = ['SPP_COM', 'GSSIZE', 'GSS_LOW', "GSS_HIGH", "GSS_AVE"])
for ind in gss_df.index:
gss_numbers = extract_num(gss_df['GSSIZE'][ind])
if len(gss_numbers) == 1:
gss_num_df.loc[len(gss_num_df.index)] = [gss_df['SPP_COM'][ind], gss_df['GSSIZE'][ind], gss_numbers[0], "", gss_numbers[0]]
elif len(gss_numbers) == 2:
gss_num_df.loc[len(gss_num_df.index)] = [gss_df['SPP_COM'][ind], gss_df['GSSIZE'][ind], gss_numbers[0], gss_numbers[1], (gss_numbers[0]+gss_numbers[1]) / 2]
filtered = gss_num_df.groupby(['SPP_COM']).filter(lambda x: len(x) > 49)
mean = filtered.groupby(['SPP_COM'])['GSS_AVE'].mean()
stddev = filtered.groupby(['SPP_COM'])['GSS_AVE'].std()
print(mean)
print(stddev)