Skip to content
UPK SDA DECA 2022 - 2023 (lake only)
# Import pandas
import pandas as pd
# Create 3 dataframes using excel files that correspond to UPK data only, geauga data only and lake data
upk = pd.read_excel('DECA 2022 - 2023 (upk_only).xlsx', sheet_name = 'deca')
lake = pd.read_excel('Lake 2023 - 2024 (Processed).xlsx')lake.shape# Print a list of columns in the lake dataframe
lake.columns.tolist()# Drop the 'Data Description' column from lake dataframe
# lake = lake.drop(columns=['Data Description', 'Q1',
# 'Q2',
# 'Q3',
# 'Q4',
# 'Q5',
# 'Q6',
# 'Q7',
# 'Q8',
# 'Q9',
# 'Q10',
# 'Q11',
# 'Q12',
# 'Q13',
# 'Q14',
# 'Q15',
# 'Q16',
# 'Q17',
# 'Q18',
# 'Q19',
# 'Q20',
# 'Q21',
# 'Q22',
# 'Q23',
# 'Q24',
# 'Q25',
# 'Q26',
# 'Q27',
# 'Q28',
# 'Q29',
# 'Q30',
# 'Q31',
# 'Q32',
# 'Q33',
# 'Q34',
# 'Q35',
# 'Q36',
# 'Q37',
# 'Q38',
# 'Q39',
# 'Q40',
# 'Q41',
# 'Q42',
# 'Q43',
# 'Q44',
# 'Q45',
# 'Q46',
# 'Q47',
# 'Q48',
# 'Q49',
# 'Q50',
# 'Q51',
# 'Q52',
# 'Q53',
# 'Q54',
# 'Q55',
# 'Q56',
# 'Q57',
# 'Q58',
# 'Q59',
# 'Q60',
# 'Q61',
# 'Q62',
# 'Q63',
# 'Q64',
# 'Q65',
# 'Q66',
# 'Q67',
# 'Q68',
# 'Q69',
# 'Q70',
# 'Q71',
# 'Q72',
# 'UDF1',
# 'UDF2',
# 'UDF3',
# 'UDF4',
# 'UDF5',
# 'UDF6',
# 'UDF7',
# 'UDF8',
# 'UDF9',
# 'UDF10',
# 'UDF11',
# 'UDF12',
# 'UDF13',
# 'UDF14',
# 'UDF15'])# UPK represents Cuyahoga county
upk.shape# All Lake county data
lake.shapelake# Create a column index of an intersection colomns that the upk, geauga and lake dataframes have in common
common_columns = upk.columns.intersection(lake.columns)
# Use the common_columns to filter geauga and lake dataframes for only those collumns in common_columns
lake_columns = lake[common_columns]# Print the entries in common_columns
common_columns# Verify the new shape
# The new shape should have 82 columns as opposed to the 86 columns in the UPK dataframe
# We will add 4 columns in later stages of the notebook
lake_columns.shapelake_columns# Create a list of the rater types of interest. T = Teacher, P = Parent
rater_type = ['T', 'P']
# Create a list of the rating periods of interest. P = Pre, T = Post
rating_period = ['P', 'T']# Remove duplicate ratings for a rater_type within the same rating_period.
def remove_duplicate_ratings(df, rater_type, rating_period):
# Filter the dataframe for entries with specified RaterType and RatingPeriod
filtered_df = df[(df['RaterType'] == rater_type) & (df['RatingPeriod'] == rating_period)]
# Sort the entries by 'ChildId' and 'RatingDate' in descending order to have the latest entry first
sorted_df = filtered_df.sort_values(by=['ChildId', 'RatingDate'], ascending=[True, False])
# Drop duplicates to keep only the latest entry for each 'ChildId'
latest_entries_df = sorted_df.drop_duplicates(subset=['ChildId'], keep='first')
# Now, remove all old entries for specified RaterType and RatingPeriod from the original dataframe
cleaned_df = df.drop(df[(df['RaterType'] == rater_type) & (df['RatingPeriod'] == rating_period)].index)
# Append the latest entries back to the dataframe
updated_df = cleaned_df.append(latest_entries_df, ignore_index=True)
return updated_df
# Apply the remove_duplicate_ratings function to remove pre and post duplicate assessments for parents and teachers in geauga
for r_type in rater_type:
for r_period in rating_period:
lake = remove_duplicate_ratings(lake, r_type, r_period)# Check to see if a child has both Pre and Post ratings
def check_pre_and_post(child_id, rater_type, df):
# Check if there is an entry for the child with RaterType 'T' and RatingPeriod 'P'
pre_condition = ((df['ChildId'] == child_id) &
(df['RaterType'] == rater_type) &
(df['RatingPeriod'] == 'P')).any()
# Check if there is an entry for the child with RaterType 'T' and RatingPeriod 'T'
post_condition = ((df['ChildId'] == child_id) &
(df['RaterType'] == rater_type) &
(df['RatingPeriod'] == 'T')).any()
# Return 'yes' if both conditions are met, else return 'no'
return 'yes' if pre_condition and post_condition else 'no'