Skip to content

Streamlined Data Ingestion with pandas

Importing Data from Flat Files

# Importing the course packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize
from sqlalchemy import create_engine

TIP:

If the file is TSV, then we could load TSV using the sep keyword argument to set delimiter: data = pd.read_csv('vt_tax_data_2016.tsv', sep = '\t')

# Reading the CSV and assigning it to the variable data
tax_data = pd.read_csv('https://assets.datacamp.com/production/repositories/4412/datasets/61bb27bf939aac4344d4f446ce6da1d1bf534174/vt_tax_data_2016.csv')

# Viewing the first few lines of data
print(tax_data.head(5))
print(tax_data.shape)
# Plotting the total number of tax returns by income group
counts = tax_data.groupby("agi_stub").N1.sum()
counts.plot.bar()
plt.show()

Import a subset of columns


# Creating a list of columns to use: zipcode, agi_stub (income group), mars1 (number of single households), MARS2 (number of households filing as married), and NUMDEP (number of dependents).
cols = ['zipcode', 'agi_stub', 'mars1', 'MARS2', 'NUMDEP']

# Creating dataframe from csv using only selected columns
data1 = pd.read_csv("https://assets.datacamp.com/production/repositories/4412/datasets/61bb27bf939aac4344d4f446ce6da1d1bf534174/vt_tax_data_2016.csv", usecols = cols)

# Viewomg counts of dependents and tax returns by income level
print(data1.groupby("agi_stub").sum())

Import a file in chunks

# Creating dataframe of next 500 rows with labeled columns
vt_data_first500 = pd.read_csv('https://assets.datacamp.com/production/repositories/4412/datasets/61bb27bf939aac4344d4f446ce6da1d1bf534174/vt_tax_data_2016.csv', nrows=500)

vt_data_next500 = pd.read_csv("https://assets.datacamp.com/production/repositories/4412/datasets/61bb27bf939aac4344d4f446ce6da1d1bf534174/vt_tax_data_2016.csv", 
                       		  nrows=500,
                       		  skiprows=500,
                       		  header = None,
                       		  names = list(vt_data_first500))

# Viewing the Vermont dataframes to confirm they're different
print(vt_data_first500.head())
print(vt_data_next500.head())

Handling error and missing data

print(data1.dtypes)

Zipcode is a string, and agi_stin is income group also a category. Making them into objects.

# Creating dict specifying data types for agi_stub and zipcode.
data_types = dtype={'agi_stub': object, 'zipcode': str}
			  

# Loading csv using dtype to set correct data types
data = pd.read_csv('https://assets.datacamp.com/production/repositories/4412/datasets/61bb27bf939aac4344d4f446ce6da1d1bf534174/vt_tax_data_2016.csv', dtype=data_types)

# Printing data types of resulting frame
print(data.dtypes.head())

Customizing Missing DataValues