Skip to content
Competition - Bee friendly plants
Executive summary
Introduction
Exploratory Data Analysis
# imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt# load data
# The path is relative, so it is resolved against the current working directory.
data = pd.read_csv("data/plants_and_bees.csv")

# Missing data
# Count the missing values in every column and show them as horizontal bars.
fig, ax = plt.subplots()
data.isna().sum().plot.barh(ax=ax)
# Bind the returned Text object to `_` so the notebook cell does not echo it.
_ = ax.set_title("Missing data per column")

# Datetime
- 9 distinct days with measurements
- Most records are taken at 9am
# Combine date (e.g. 04/17/2017) and time (e.g. 935) into one datetime
# (e.g. 2017-04-17 09:35). The time is zero-padded to four characters first
# so that "935" becomes "0935" and is read as 09:35.
padded_time = data.time.astype("str").str.zfill(4)
data["datetime"] = pd.to_datetime(
    data.date + " " + padded_time,
    # Explicit format: avoids per-element format guessing and any
    # month/day ambiguity in the MM/DD/YYYY dates.
    format="%m/%d/%Y %H%M",
)
# NOTE: the raw "date" and "time" columns are deliberately kept; to drop them:
# data = data.drop(columns=["date", "time"])
# plotting: number of records per calendar day (left panel) and per hour of
# day (right panel). Both panels share one y-axis, so only the left panel
# carries the y-label.
fig, (ax_0, ax_1) = plt.subplots(1, 2, figsize=(12, 3), sharey=True)

# Left: one bar per distinct date, in chronological order.
data.datetime.dt.date.value_counts().sort_index().plot.bar(ax=ax_0)
ax_0.set_ylabel("Number of records")
ax_0.set_title("Number of records per day")

# Right: one bar per hour of day, in ascending order.
data.datetime.dt.hour.value_counts().sort_index().plot.bar(ax=ax_1)
ax_1.set_xlabel("Hour of day")
_ = ax_1.set_title("Number of records per hour of day")
# Heatmap of the number of records for every (date, hour of day) combination.
records_per_day_hour = (
    data
    # Derive the two grouping keys from the combined datetime column.
    .assign(date=lambda x: x.datetime.dt.date, hour=lambda x: x.datetime.dt.hour)
    # One row per date, one column per hour; each cell counts the records.
    .pivot_table(index="date", columns="hour", values="datetime", aggfunc="count")
    # Combinations without any record are missing after the pivot; plot them as 0.
    .fillna(0)
)
fig, ax = plt.subplots()
sns.heatmap(data=records_per_day_hour, ax=ax)
_ = ax.set_title("Number of records per day and hour of day")

# Number of observations per sample
# Histogram of how many rows each distinct sample_id has in the data.
fig, ax = plt.subplots(figsize=(8, 4))
data.sample_id.value_counts().hist(ax=ax)
ax.set_title("Number of samples vs number of observations per sample")
ax.set_ylabel("Number of samples")
_ = ax.set_xlabel("Number of observations per sample")

# Bee species
# Histogram of the values in the species_num column.
fig, ax = plt.subplots(figsize=(8, 4))
data.species_num.plot.hist(ax=ax)
ax.set_title("Number of samples vs number of different bee species")
ax.set_ylabel("Number of samples")
_ = ax.set_xlabel("Number of different bee species")

# Season
# Pie chart of how the records split across the values of the season column.
season_counts = data.season.value_counts()
fig, ax = plt.subplots(figsize=(8, 4))
season_counts.plot.pie(
    ax=ax,
    autopct='%1.1f%%',
    # One offset per wedge: the second wedge is pulled slightly out of the pie.
    explode=(0, 0.1),
)
ax.set_title("Season during sample collection")
# Blank out the y-label that the pie plot would otherwise show.
_ = ax.set_ylabel("")