UPK SDA DECA 2022 - 2023 (lake only)

# Import pandas
import pandas as pd

# Create 3 dataframes using excel files that correspond to UPK data only, geauga data only and lake data
upk = pd.read_excel('DECA 2022 - 2023 (upk_only).xlsx', sheet_name = 'deca')
lake = pd.read_excel('Lake 2023 - 2024 (Processed).xlsx')

lake.shape

# Print a list of columns in the lake dataframe
lake.columns.tolist()

# Drop the 'Data Description' column from lake dataframe
# lake = lake.drop(columns=['Data Description', 'Q1',
#  'Q2',
#  'Q3',
#  'Q4',
#  'Q5',
#  'Q6',
#  'Q7',
#  'Q8',
#  'Q9',
#  'Q10',
#  'Q11',
#  'Q12',
#  'Q13',
#  'Q14',
#  'Q15',
#  'Q16',
#  'Q17',
#  'Q18',
#  'Q19',
#  'Q20',
#  'Q21',
#  'Q22',
#  'Q23',
#  'Q24',
#  'Q25',
#  'Q26',
#  'Q27',
#  'Q28',
#  'Q29',
#  'Q30',
#  'Q31',
#  'Q32',
#  'Q33',
#  'Q34',
#  'Q35',
#  'Q36',
#  'Q37',
#  'Q38',
#  'Q39',
#  'Q40',
#  'Q41',
#  'Q42',
#  'Q43',
#  'Q44',
#  'Q45',
#  'Q46',
#  'Q47',
#  'Q48',
#  'Q49',
#  'Q50',
#  'Q51',
#  'Q52',
#  'Q53',
#  'Q54',
#  'Q55',
#  'Q56',
#  'Q57',
#  'Q58',
#  'Q59',
#  'Q60',
#  'Q61',
#  'Q62',
#  'Q63',
#  'Q64',
#  'Q65',
#  'Q66',
#  'Q67',
#  'Q68',
#  'Q69',
#  'Q70',
#  'Q71',
#  'Q72',
#  'UDF1',
#  'UDF2',
#  'UDF3',
#  'UDF4',
#  'UDF5',
#  'UDF6',
#  'UDF7',
#  'UDF8',
#  'UDF9',
#  'UDF10',
#  'UDF11',
#  'UDF12',
#  'UDF13',
#  'UDF14',
#  'UDF15'])

# UPK represents Cuyahoga county
upk.shape

# All  Lake county data
lake.shape

lake

# Create a column index of an intersection colomns that the upk, geauga and lake dataframes have in common
common_columns = upk.columns.intersection(lake.columns)

# Use the common_columns to filter geauga and lake dataframes for only those collumns in common_columns
lake_columns = lake[common_columns]

# Print the entries in common_columns
common_columns

# Verify the new shape
# The new shape should have 82 columns as opposed to the 86 columns in the UPK dataframe
# We will add 4 columns in later stages of the notebook
lake_columns.shape

lake_columns

# Create a list of the rater types of interest. T = Teacher, P = Parent
rater_type = ['T', 'P']

# Create a list of the rating periods of interest. P = Pre, T = Post
rating_period = ['P', 'T']

# Remove duplicate ratings for a rater_type within the same rating_period.
def remove_duplicate_ratings(df, rater_type, rating_period):
    # Filter the dataframe for entries with specified RaterType and RatingPeriod
    filtered_df = df[(df['RaterType'] == rater_type) & (df['RatingPeriod'] == rating_period)]

    # Sort the entries by 'ChildId' and 'RatingDate' in descending order to have the latest entry first
    sorted_df = filtered_df.sort_values(by=['ChildId', 'RatingDate'], ascending=[True, False])

    # Drop duplicates to keep only the latest entry for each 'ChildId'
    latest_entries_df = sorted_df.drop_duplicates(subset=['ChildId'], keep='first')

    # Now, remove all old entries for specified RaterType and RatingPeriod from the original dataframe
    cleaned_df = df.drop(df[(df['RaterType'] == rater_type) & (df['RatingPeriod'] == rating_period)].index)

    # Append the latest entries back to the dataframe
    updated_df = cleaned_df.append(latest_entries_df, ignore_index=True)

    return updated_df

# Apply the remove_duplicate_ratings function to remove pre and post duplicate assessments for parents and teachers in geauga
for r_type in rater_type:
    for r_period in rating_period:
        lake = remove_duplicate_ratings(lake, r_type, r_period)

# Check to see if a child has both Pre and Post ratings
def check_pre_and_post(child_id, rater_type, df):
    # Check if there is an entry for the child with RaterType 'T' and RatingPeriod 'P'
    pre_condition = ((df['ChildId'] == child_id) & 
                     (df['RaterType'] == rater_type) & 
                     (df['RatingPeriod'] == 'P')).any()
    # Check if there is an entry for the child with RaterType 'T' and RatingPeriod 'T'
    post_condition = ((df['ChildId'] == child_id) & 
                      (df['RaterType'] == rater_type) & 
                      (df['RatingPeriod'] == 'T')).any()
    # Return 'yes' if both conditions are met, else return 'no'
    return 'yes' if pre_condition and post_condition else 'no'

‌
‌
‌