Skip to content

NOTE: THIS PROJECT IS A WORK IN PROGRESS

During my first internship at Levine Querido in 2019, I completed a project that investigated a piece of anecdotal wisdom many people in publishing had shared with me: that the month in which a book is published can have a big impact on its chances of winning an award.

I collected data on the books honored by the Pura Belpré, Caldecott, Coretta Scott King, Newbery, Printz, and Sibert Awards from 2009 to 2019. With that data, I presented a series of histograms and a summary of my findings. Presented here is an updated version of that presentation, coded in Python.

Importing and cleaning data

this process includes:

  • removing blank rows
  • converting the pub date column to datetime
  • on a few of the spreadsheets, citations were indicated only by color coding; for those datasets I added a citation column
import pandas as pd
# Load the Pura Belpré sheet and discard rows 69-71
# (presumably the blank rows mentioned in the cleaning notes -- confirm against the CSV).
belpre = pd.read_csv('./pubdate data.xlsx - belpre.csv')
belpre.drop(index=[69, 70, 71], inplace=True)

# (name, year) pairs of the author-award winners, 2009-2019.
winners = [
    ('Elizabeth Acevedo', 2019),
    ('Ruth Behar', 2018),
    ('Juana Medina', 2017),
    ('Margarita Engle', 2016),
    ('Marjorie Agosín', 2015),
    ('Meg Medina', 2014),
    ('Benjamin Alire Sáenz', 2013),
    ('Guadalupe Garcia McCall', 2012),
    ('Pam Muñoz Ryan', 2011),
    ('Julia Alvarez', 2010),
    ('Margarita Engle', 2009),
]
# (name, year) pairs of the illustrator-award winners, 2009-2019.
artwinners = [
    ('Yuyi Morales', 2019),
    ('Juana Martinez-Neal', 2018),
    ('Raul Gonzalez', 2017),
    ('Rafael López', 2016),
    ('Yuyi Morales', 2015),
    ('Yuyi Morales', 2014),
    ('David Diaz', 2013),
    ('Duncan Tonatiuh', 2012),
    ('Eric Velasquez', 2011),
    ('Rafael López', 2010),
    ('Yuyi Morales', 2009),
]

# A row is a 'Winner' when its author matches the author list or its
# illustrator matches the illustrator list for that year; every other row
# is an 'Honor'.
belpre['citation'] = [
    'Winner'
    if (writer, award_year) in winners or (artist, award_year) in artwinners
    else 'Honor'
    for writer, artist, award_year in zip(
        belpre['Author'], belpre['Illustrator'], belpre['Year']
    )
]
belpre
import pandas as pd
# Load the Caldecott sheet, then remove the leftover 'Unnamed: 0' column and
# rows 51-54 (presumably blank spreadsheet rows -- confirm against the CSV).
caldecott = pd.read_csv('pubdate data.xlsx - caldecott.csv')
caldecott.drop(columns=['Unnamed: 0'], inplace=True)
caldecott.drop(index=[51, 52, 53, 54], inplace=True)

# (name, year) pairs of the medal winners, 2009-2019.
# NOTE(review): the stray spaces inside some names are kept verbatim because
# the comparison below is an exact string match; presumably they mirror the
# spreadsheet cells -- verify before normalising.
winners = [
    ('Sophie Blackall', 2019),
    ('Matthew Cordell ', 2018),
    ('Javaka Steptoe ', 2017),
    ('Lindsay Mattick /Sophie Blackall', 2016),
    ('Dan Santat', 2015),
    ('Brian Floca', 2014),
    ('Jon Klassen ', 2013),
    ('Chris Raschka', 2012),
    ('Philip C. Stead/Erin E. Stead', 2011),
    ('Jerry Pinkney', 2010),
    ('Susan Marie Swanson/Beth Krommes', 2009),
]

# Tag each row 'Winner' when its (creator, year) pair is in the list above,
# otherwise 'Honor'.
caldecott['citation'] = [
    'Winner' if (creator, award_year) in winners else 'Honor'
    for creator, award_year in zip(caldecott['Author/Illustrator'], caldecott['Year'])
]
caldecott
import pandas as pd
# Load the Coretta Scott King sheet and discard rows 72-73
# (presumably blank spreadsheet rows -- confirm against the CSV).
csk = pd.read_csv('pubdate data.xlsx - csk.csv')
csk.drop(index=[72, 73], inplace=True)

# (name, year) pairs of the winners, 2009-2019. Two runs of eleven entries
# are kept in a single list because both are matched against 'Person'.
winners = [
    ('Claire Hartfield', 2019),
    ('Renée Watson', 2018),
    ('John Lewis and Andrew Aydin', 2017),
    ('Rita Williams-Garcia', 2016),
    ('Jacqueline Woodson', 2015),
    ('Rita Williams-Garcia', 2014),
    ('Andrea Davis Pinkney', 2013),
    ('Kadir Nelson', 2012),
    ('Rita Williams-Garcia', 2011),
    ('Vaunda Micheaux Nelson', 2010),
    ('Kadir Nelson', 2009),
    ('Ekua Holmes', 2019),
    ('Ekua Holmes', 2018),
    ('Javaka Steptoe', 2017),
    ('Bryan Collier', 2016),
    ('Christopher Myers', 2015),
    ('Bryan Collier', 2014),
    ('Bryan Collier', 2013),
    ('Shane W. Evans', 2012),
    ('Bryan Collier', 2011),
    ('Charles R. Smith Jr.', 2010),
    ('Floyd Cooper', 2009),
]

# Tag each row 'Winner' when its (person, year) pair is in the list above,
# otherwise 'Honor'.
csk['citation'] = [
    'Winner' if (honoree, award_year) in winners else 'Honor'
    for honoree, award_year in zip(csk['Person'], csk['Year'])
]
csk
import pandas as pd
# Load the Newbery sheet and keep only its first five columns by dropping the
# columns at positions 5 through 14.
newberry = pd.read_csv('pubdate data.xlsx - newberry.csv')
surplus_columns = newberry.columns[list(range(5, 15))]
newberry.drop(columns=surplus_columns, inplace=True)

# (name, year) pairs of the medal winners, 2009-2019.
# NOTE(review): leading/trailing spaces inside some names are kept verbatim
# because the comparison below is an exact string match; presumably they
# mirror the spreadsheet cells -- verify before normalising.
winners = [
    ('Meg Medina', 2019),
    ('Erin Entrada Kelly', 2018),
    ('Kelly Barnhill', 2017),
    ('Matt de la Peña', 2016),
    ('Kwame Alexander ', 2015),
    ('Kate DiCamillo', 2014),
    (' Katherine Applegate', 2013),
    ('Jack Gantos ', 2012),
    ('Clare Vanderpool', 2011),
    ('Rebecca Stead ', 2010),
    ('Neil Gaiman/Dave McKean ', 2009),
]

# Tag each row 'Winner' when its (creator, year) pair is in the list above,
# otherwise 'Honor'.
newberry['citation'] = [
    'Winner' if (creator, award_year) in winners else 'Honor'
    for creator, award_year in zip(newberry['Author/Illustrator'], newberry['Year'])
]
newberry
import pandas as pd
# Load the Printz sheet without rows 52-53
# (presumably blank spreadsheet rows -- confirm against the CSV).
printz = pd.read_csv('pubdate data.xlsx - printz.csv').drop(index=[52, 53])
printz
import pandas as pd
# Load the Sibert sheet without rows 51-52
# (presumably blank spreadsheet rows -- confirm against the CSV).
sibert = pd.read_csv('pubdate data.xlsx - sibert.csv').drop(index=[51, 52])
sibert
# Collect the six award tables so the same steps can be applied to each one.
dfs = (belpre, caldecott, csk, newberry, printz, sibert)

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after each table.
for df in dfs:
    df.info()
import pandas as pd

# Parse 'Pub Date' as MM/DD/YYYY. With errors='coerce', any value that does
# not match the format becomes NaT instead of raising.
for df in dfs:
    df['Pub Date'] = pd.to_datetime(df['Pub Date'], errors='coerce', format='%m/%d/%Y')

# Inspect the result of the conversion. DataFrame.info() prints its summary
# itself and returns None, so it is not wrapped in print() (that would add a
# stray "None" line after every table).
for df in dfs:
    df.info()
    print(df['Pub Date'])
Hidden output
# Derive the month number from each parsed publication date, store it in a
# new 'Pub Month' column, and print that column for inspection.
for frame in dfs:
    month_numbers = frame['Pub Date'].dt.month
    frame['Pub Month'] = month_numbers
    print(frame['Pub Month'])
Hidden output
# Print the most frequent publication month(s) of every award table;
# Series.mode() returns a Series, so ties show up as several rows.
for frame in dfs:
    most_common_months = frame['Pub Month'].mode()
    print(most_common_months)

Visualizing Pub Month Popularity

The six histograms presented below count the number of awarded books published in each month. The blue bars represent all books, while the orange bars show only the winners.

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


# One panel per award on a 3x2 grid; extra spacing keeps titles and tick
# labels of neighbouring panels from overlapping.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2)
fig.subplots_adjust(wspace=0.5,
                    hspace=0.5)
titles = ['belpre', 'caldecott', 'csk', 'newberry', 'printz', 'sibert']
axes = [ax1, ax2, ax3, ax4, ax5, ax6]
frames = [belpre, caldecott, csk, newberry, printz, sibert]
# The first four tables use the lowercase 'citation' column, while printz and
# sibert are read through a capitalised 'Citation' column.
citation_columns = ['citation', 'citation', 'citation', 'citation', 'Citation', 'Citation']

months = list(range(1, 13))
# Explicit bin edges at 0.5, 1.5, ..., 12.5 give exactly one bar per calendar
# month, centred on its tick. Sharing the edges between both layers keeps the
# winners overlay directly comparable with the all-books histogram (bins=12 /
# the default bins would be spread over each series' own min-max range).
month_edges = [month - 0.5 for month in range(1, 14)]

for ax, title, frame, citation_column in zip(axes, titles, frames, citation_columns):
    # Rows whose date could not be parsed have no month; leave them out.
    all_months = frame['Pub Month'].dropna()
    winner_months = frame.loc[frame[citation_column] == 'Winner', 'Pub Month'].dropna()

    # All honored books first (semi-transparent), winners drawn on top.
    ax.hist(all_months, bins=month_edges, alpha=0.5)
    ax.hist(winner_months, bins=month_edges, color='orange')

    # Half-month padding so the January and December bars are not cut in half.
    ax.set_xlim(0.5, 12.5)
    ax.set_xticks(months)
    ax.set_yticks([1, 3, 6, 9, 12, 15])
    ax.set_title(title)

# Render the figure before clearing it; calling plt.clf() on its own would
# wipe the histograms before they are ever displayed.
plt.show()
plt.clf()