Skip to content
Data Analyst with Python
# Start coding here... PANDAS
.head() returns the first few rows (the “head” of the DataFrame).
.info() shows information on each of the columns, such as the data type and number of missing values.
.shape returns the number of rows and columns of the DataFrame.
.describe() calculates a few summary statistics for each column.
.values: A two-dimensional NumPy array of values.
.columns: An index of columns: the column names.
.index: An index for the rows: either row numbers or row names.
df.sort_values()
one column df.sort_values("breed")
multiple columns df.sort_values(["breed", "weight_kg"])
df.sort_values(["breed", "weight_kg"],ascending=False)
homelessness_reg_fam = homelessness.sort_values(["region", "family_members"], ascending=[True, False])
Aggregating DataFrames
df['column'].mean()
.agg()
.cumsum() -> cumulative sum: the first value, then the first plus the second, ...
.drop_duplicates(subset=['column','column'])
.value_counts(normalize=True) # Percentage
.mean()
.agg([min,max,sum])
df.groupby('column')['column'].mean()
df.pivot_table(values='column', index='column', aggfunc=np.median, fill_value=0, margins=True)
mean_sales_by_type_holiday = sales.pivot_table(values="weekly_sales", index="type", columns="is_holiday")
.columns
.set_index('name')
.reset_index(drop=True)
.sort_index()
temperatures_ind.sort_index(level=["country", "city"], ascending = [True, False])
# Slicing lists
df[1:5]
.loc['column1':'column4'] #-> Doesn't work in index
.loc[('filter1', 'filter2'):('filter3','filter4')]
(temperatures_srt.loc[("India", "Hyderabad"):("Iraq", "Baghdad"), "date":"avg_temp_c"])
# pivot tables
df.pivot_table(values='column', index='column', aggfunc=np.median, fill_value=0, margins=True)
mean_sales_by_type_holiday = sales.pivot_table(values="weekly_sales", index="type", columns="is_holiday")
Visualizing your data
import matplotlib.pyplot as plt
df['column'].hist(bins=20, alpha=0.7)
plt.show()
.plot(x='colu', y='col', kind='line')  # or kind='scatter'
#missing value
.isna().any().sum().plot()
.dropna()
.fillna(0)
#dictionaries
my_dict = {'key1': value1, 'key2': value2}
my_dict['key1']
.to_csv('sdf.csv')
Intermediate
plt.plot(xdf,ydf)
plt.scatter(xdf,ydf)
plt.xlabel
plt.ylabel
plt.title
plt.yticks
.index
# keys have to be immutable objects
# easily select data -> list ; looking for data must be fast -> Dictionary
pd.read_csv('file.csv', index_col = 0)
.loc (label-based)
.iloc (integer position-based)
df.loc[['row1','row2'],['col1','col2']]
cars['cars_per_cap']
cars[['cars_per_cap']]
The single bracket version gives a Pandas Series, the double bracket version gives a Pandas DataFrame.