# Salary vs. age analysis: mean-salary baseline and linear regression on "Salary Data.csv".
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.utils.validation import check_is_fitted
# Load the salary dataset and drop every row that has a missing value.
df = pd.read_csv("Salary Data.csv")
df.dropna(inplace=True)

# Tally how often each job title occurs; keys appear in order of first occurrence.
jt = list(df["Job Title"])
job_dict = {}

for job in jt:
    # dict.get supplies 0 for a title not seen yet, so no membership test is needed.
    job_dict[job] = job_dict.get(job, 0) + 1

# Rows for one job title (the result is only displayed in an interactive session).
df[df["Job Title"] == "Director of Marketing"]
# Scatter plot of salary against employee age.
plt.scatter(df["Age"], df["Salary"])
plt.ylabel("Salary")
plt.xlabel("Employee Age")
plt.title("Salary vs. Age");
# Model inputs
# NOTE(review): no train/test split is performed in this section -- every
# row of df feeds both the feature matrix and the target vector.

# Target vector: the column the model will learn to predict.
target = "Salary"
y_train = df[target]

# Feature matrix: selecting with a list keeps the result a DataFrame.
features = ["Age"]
X_train = df[features]

# Shape checks (values are only displayed in an interactive session).
X_train.shape
y_train.shape
# Baseline model: always predict the mean salary, whatever the age.
y_mean = y_train.mean()
y_mean
# One copy of the mean per training row, so it lines up with y_train.
y_prediction_baseline = [y_mean] * len(y_train)
y_prediction_baseline[:2]
# Draw the flat baseline together with the raw data points.
plt.plot(X_train["Age"], y_prediction_baseline, color="orange", label="Model Baseline")
plt.scatter(x=df["Age"], y=df["Salary"])
plt.xlabel("Employee Age")
plt.ylabel("Salary")
# The label= passed to plt.plot is only rendered once a legend is drawn.
plt.legend()
plt.title("Salary vs. Age");
# Baseline error: mean absolute error of always predicting the mean salary.
mae = mean_absolute_error(y_true=y_train, y_pred=y_prediction_baseline)
mae
# Instantiate the linear regression estimator.
model = LinearRegression()
isinstance(model, LinearRegression)
# Train it on the feature matrix and target vector.
model.fit(X=X_train, y=y_train)