# Exploratory look at "Sample - Superstore.csv": order counts by month and a segment/region breakdown.
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib as plt
# Load the Superstore extract, decoding the file as Latin-1.
sm = pd.read_csv("Sample - Superstore.csv", encoding="latin1")
sm.head()
sm.info()

# Convert both date columns from month/day/year text to datetimes.
for date_col in ("Order Date", "Ship Date"):
    sm[date_col] = pd.to_datetime(sm[date_col], format="%m/%d/%Y")

# Split the order date into year / month / day columns used for grouping.
order_dt = sm["Order Date"].dt
sm["OrderY"] = order_dt.year
sm["OrderM"] = order_dt.month
sm["OrderD"] = order_dt.day

# Per-row profit as a fraction of that row's sales.
sm["Profitability"] = sm["Profit"].div(sm["Sales"])

# Summary statistics (numeric, then object columns), total missing cells,
# number of fully duplicated rows, and a final peek at the enriched frame.
sm.describe()
sm.describe(include="object")
sm.isna().sum().sum()
sm.duplicated().sum()
sm.head()
# Distinct orders per calendar month, pooled across all years.
# nunique() replaces count(): count() tallies rows, so an order that occupies
# several rows would be counted once per row instead of once per order.
# NOTE(review): assumes the CSV may hold several rows per "Order ID" (e.g. one
# per line item) -- confirm. If every row is its own order the result is the
# same as before.
ocm = sm.groupby("OrderM")["Order ID"].nunique().reset_index(name="Count")
px.line(ocm, x="OrderM", y="Count", markers=True)

# Distinct orders per (year, month). reset_index() already produces the
# columns OrderY, OrderM and "Order ID", so no explicit renaming is needed.
ocym = sm.groupby(["OrderY", "OrderM"])["Order ID"].nunique().reset_index()

# Build a first-of-month timestamp so the x-axis is a true, ordered date axis;
# the format is given explicitly rather than left to inference.
ocym["Date"] = pd.to_datetime(
    ocym["OrderY"].astype(str) + "-" + ocym["OrderM"].astype(str).str.zfill(2),
    format="%Y-%m",
)
px.line(ocym, x="Date", y="Order ID", markers=True, title="Orders Over Time")
# Row counts per customer segment, with bars coloured by region.
px.histogram(
    sm,
    x="Segment",
    color="Region",
)