What truly influences demand and sales for Walmart?¶
Contribution Details:¶
Contribution Checkpoints:
A: Project idea - 5%
B: Dataset Curation and Preprocessing - 10%
C: Data Exploration and Summary Statistics - 10%
D: ML Algorithm Design/Development - 25%
E: ML Algorithm Training and Test Data Analysis - 20%
F: Visualization, Result Analysis, Conclusion - 15%
G: Final Tutorial Report Creation - 10%
H: Additional (not listed above, if any) - 5%
Member 1: Christopher Perez Lebron, Contribution: 100%
Member 2: Ever Campos, Contribution: 100%
Member 3: Abbas Islaw, Contribution: 100%
"We, all team members, agree together that the above information is true, and we are confident about our contributions to this submitted project/final tutorial."
Christopher Perez Lebron, 5/7/24
Ever Campos, 5/7/24
Abbas Islaw, 5/7/24
Contribution Details, Continued:¶
- Christopher Perez Lebron:
- Worked on everything
- Ever Campos:
- Worked on everything
- Abbas Islaw:
- Worked on everything
NOTE: Our group frequently met via Discord and walked through the project as a team. We discussed and implemented decisions on call as a single unit, so we all participated equally in every step of the project. No one completed any section on their own; we all contributed ideas and code.
Spring 2024 Data Science Project¶
Christopher Perez Lebron, Ever Campos, Abbas Islaw
Demand. It's the heart of business. Virtually everyone in the business world is curious to find out the answer to a simple question - what is influencing demand for a particular product? For Walmart, a multinational corporation that has hundreds - if not thousands - of different products, it can be complicated to find out what factors are driving demand at a store-wide scale.
Is it the selection of products themselves?
Could it be the status of the economy?
These questions are simple, yet hard to pinpoint.
We were particularly curious to find out what drives the demand and values of weekly sales for a large corporation such as Walmart. We wanted to dig deep, figure out the factors influencing weekly sales, and possibly predict weekly sales for different stores.
Data Curation / Importing Data¶
For our data, we decided to use a dataset from Kaggle, "Walmart Sales": https://www.kaggle.com/datasets/mikhail1681/walmart-sales
This dataset has thousands of datapoints covering different Walmart stores at different points in time, including their weekly sales, the Consumer Price Index (CPI) at the time, fuel prices, holiday weeks, and more.
Imports needed:
import pandas as pd
import numpy as np
import scipy as sci
import matplotlib.pyplot as plt
import datetime as dt
import math
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import LearningCurveDisplay, learning_curve
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, r2_score
from scipy.ndimage import gaussian_filter1d
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
Let's take a quick look at our data:
df = pd.read_csv("Walmart_sales.csv")
df.head()
Index | Store | Date | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment |
---|---|---|---|---|---|---|---|---|
0 | 1 | 05-02-2010 | 1643690.90 | 0 | 42.31 | 2.572 | 211.096358 | 8.106 |
1 | 1 | 12-02-2010 | 1641957.44 | 1 | 38.51 | 2.548 | 211.242170 | 8.106 |
2 | 1 | 19-02-2010 | 1611968.17 | 0 | 39.93 | 2.514 | 211.289143 | 8.106 |
3 | 1 | 26-02-2010 | 1409727.59 | 0 | 46.63 | 2.561 | 211.319643 | 8.106 |
4 | 1 | 05-03-2010 | 1554806.68 | 0 | 46.50 | 2.625 | 211.350143 | 8.106 |
Data Cleaning¶
Moreover, let's convert the date column into datetime:
df["Date"] = pd.to_datetime(df['Date'], format="%d-%m-%Y")
print(df.dtypes)
df.head()
Store int64 Date datetime64[ns] Weekly_Sales float64 Holiday_Flag int64 Temperature float64 Fuel_Price float64 CPI float64 Unemployment float64 dtype: object
Index | Store | Date | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment |
---|---|---|---|---|---|---|---|---|
0 | 1 | 2010-02-05 | 1643690.90 | 0 | 42.31 | 2.572 | 211.096358 | 8.106 |
1 | 1 | 2010-02-12 | 1641957.44 | 1 | 38.51 | 2.548 | 211.242170 | 8.106 |
2 | 1 | 2010-02-19 | 1611968.17 | 0 | 39.93 | 2.514 | 211.289143 | 8.106 |
3 | 1 | 2010-02-26 | 1409727.59 | 0 | 46.63 | 2.561 | 211.319643 | 8.106 |
4 | 1 | 2010-03-05 | 1554806.68 | 0 | 46.50 | 2.625 | 211.350143 | 8.106 |
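Before moving on, a quick check for missing values and duplicate rows rounds out the cleaning step; a minimal sketch using only the columns already in the dataset:
# Quick data-quality checks
print(df.shape)               # number of rows and columns
print(df.isna().sum())        # missing values per column
print(df.duplicated().sum())  # count of fully duplicated rows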
Exploratory Data Analysis¶
Individual Feature Exploration¶
Stores¶
Taking a look at the number of stores¶
numStores = len(df["Store"].unique())
print("Number of Stores: " + str(numStores))
Number of Stores: 45
Clearly, we have a sizeable number of stores, though still only a small fraction of Walmart's several thousand US locations.
Taking a look at the number of holidays per store¶
print(df[df["Holiday_Flag"] == 1].groupby("Store")["Holiday_Flag"].count().hist())
Axes(0.125,0.11;0.775x0.77)
It looks like each store has exactly 10 holidays.
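To double-check that claim directly rather than reading it off the histogram, a minimal sketch:
# Count holiday weeks per store and summarize the counts
holidaysPerStore = df[df["Holiday_Flag"] == 1].groupby("Store")["Holiday_Flag"].count()
print(holidaysPerStore.value_counts())  # a single row (e.g. 10 -> 45) would confirm every store has the same count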
Taking a look at the average sales per store¶
# Average sales per store differs
df.groupby("Store")["Weekly_Sales"].mean().hist()
<Axes: >
It seems like the average sales per store varies.
Taking a look at average temperature per store¶
df.groupby("Store")["Temperature"].mean().hist()
<Axes: >
The distribution of average temperature per store is bimodal with a peak around 52 degrees and another peak around 70 degrees.
It seems like the stores are in different regions and/or that the dates for each store span different time frames.
Spoiler alert: when you take a look at average CPI per store it becomes clear that there is a temporal difference between stores. However, the geographic difference cannot be asserted.
Taking a look at the average CPI per store¶
df.groupby("Store")["CPI"].mean().hist()
<Axes: >
This supports the assertion that the stores come from different moments in time because CPI is fixed for each day. Thus, if the stores were all overlapping in time we'd see the same average for every store.
Taking a look at the average unemployment rate per store¶
df.groupby("Store")["Unemployment"].mean().hist()
<Axes: >
This once again supports the assertion that the data from these stores don't come from the same period of time.
However, it is weird that the average unemployment rate per store and the avg CPI per store don't follow a similar distribution. This might be worth exploring.
Summary of findings after exploring the Store column¶
Observed¶
- Stores aren't aligned in time. That is, the dates of their sales data differ.
- Average sales per store differ. Different levels of performance.
- Every store has the same number of holidays.
- There are 45 stores.
Questions¶
- Are the stores in different regions?
- Why is the distribution of average unemployment rates different from the distribution of average CPI?
- Is unemployment rate local or national?
Date¶
df.groupby("Date")["Date"].count().hist()
numUnique = len(df["Date"].unique())
print("The number of unique dates are: " + str(numUnique))
The number of unique dates are: 143
So we have 143 unique dates w/ each date having 45 corresponding instances.
df.groupby(df["Date"].dt.month)["Date"].count().plot(kind="bar")
<Axes: xlabel='Date'>
We do not have the same amount of data for each month of the year. So this, again, supports the assertion that the data for each store is not set in the same time frame.
Now, lets take a look at the amount of data we have for each year.
df.groupby(df["Date"].dt.year)["Date"].count().plot(kind="bar")
<Axes: xlabel='Date'>
We have the most sales data for 2011, followed by 2010, with the least for 2012.
Weekly Sales¶
df["Weekly_Sales"].hist()
mean = df["Weekly_Sales"].mean()
stdDev = df["Weekly_Sales"].std()
print("Mean: " + str(mean))
print("Std Dev: " + str(stdDev))
Mean: 1046964.8775617715 Std Dev: 564366.6220536975
This is a really weird distribution... why is the distribution this way? Does this have to do with how the sales data was collected, with the way Walmart operates its stores, or some other factor we cannot think of?
Holiday Flag¶
# Define the bin edges to align with tick marks
bins = [-0.25, 0.25, 0.75, 1.25]
non_holiday = df[df['Holiday_Flag'] == 0]
holiday = df[df['Holiday_Flag'] == 1]
plt.hist(non_holiday['Holiday_Flag'], bins=bins, color='blue', label='Non-Holiday', alpha=0.5)
plt.hist(holiday['Holiday_Flag'], bins=bins, color='green', label='Holiday', alpha=0.5)
plt.xlabel('Holiday Flag')
plt.ylabel('Frequency')
plt.title('Distribution of Holiday Flag')
plt.legend()
# Customizing x-axis ticks and labels
plt.xticks([0, 0.5, 1], ['Non-Holiday', '', 'Holiday'])
plt.show()
From this analysis, we can see that only a small percentage of weeks are holiday weeks (flagged as 1). Because holiday weeks are so much less frequent than non-holiday weeks, we will take a closer look at the relationship between holidays and sales for stores 1 through 4 further below.
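We can also compute the share of holiday weeks directly; a quick sketch:
# Holiday_Flag is 0/1, so its mean is the fraction of holiday rows
holidayShare = df["Holiday_Flag"].mean()
print(f"Holiday weeks make up {holidayShare:.1%} of all rows")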
Temperature¶
print(df["Temperature"].describe())
df["Temperature"].hist()
count 6435.000000 mean 60.663782 std 18.444933 min -2.060000 25% 47.460000 50% 62.670000 75% 74.940000 max 100.140000 Name: Temperature, dtype: float64
<Axes: >
There is a large variation in temperatures. This could mean one of three things:
- The stores are in different geographic locations.
- The stores are not in different geographic locations, but are all located in a region that sees both temperature extremes over the course of a year.
- Climate change
Fuel Price¶
# View histogram for fuel price
df['Fuel_Price'].hist()
<Axes: >
We have a bimodal distribution for our fuel prices.
# View summary statistics
mean = df['Fuel_Price'].mean()
stdDev = df['Fuel_Price'].std()
print(f"Mean: {mean}")
print(f"Standard Devation: {stdDev}")
Mean: 3.358606837606838 Standard Devation: 0.4590197071928525
In today's dollars, adjusted for inflation, that would be a mean gas price of about $4.70. This unusually high average may affect the generalizability of our ML model and conclusions.
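For transparency, here is roughly how such an inflation adjustment can be computed. The CPI values below are assumed approximations (they are not taken from the dataset), so treat this as a sketch:
# Assumed approximate CPI-U values: ~225 for 2011 (middle of the data) and ~313 for 2024
cpi_2011 = 224.9
cpi_2024 = 313.5
meanFuel = df["Fuel_Price"].mean()
print(f"Mean fuel price in 2024 dollars: ${meanFuel * cpi_2024 / cpi_2011:.2f}")  # roughly $4.70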
Consumer Price Index¶
# View histogram
df['CPI'].hist()
<Axes: >
It looks like we have a really weird, discrete-looking set of values, with a chunk missing in the middle. From our understanding, CPI is a continuous index that shouldn't simply jump around like that.
To dig deeper into why this occurred, let's look at the CPI for each store (if the stores all covered the same time frame, CPI should be roughly the same across them, so a per-store look should help determine the cause).
# Histogram for store 1
df[df['Store'] == 1]['CPI'].hist()
<Axes: >
frame = df[df['Store'] == 1]['CPI']
print(f"Range: [{frame.min()}, {frame.max()}]")
Range: [210.3374261, 223.4442513]
It looks like store 1's CPI is continuous over the rough range [210, 223].
# View histogram of cpi for store 2
df[df['Store'] == 2]['CPI'].hist()
<Axes: >
frame = df[df['Store'] == 2]['CPI']
print(f"Range: [{frame.min()}, {frame.max()}]")
Range: [209.9984585, 223.0783366]
Store 2's CPI values seem to be continuous in a very similar range as store 1.
# View histogram of CPI's for store 3
df[df['Store'] == 3]['CPI'].hist()
<Axes: >
frame = df[df['Store'] == 3]['CPI']
print(f"Range: [{frame.min()}, {frame.max()}]")
Range: [213.6196139, 226.9873637]
Again, a very similar range of CPI values. So stores 1, 2, and 3 all have similar CPI ranges; let's check whether store 4 follows the same pattern or is the outlier in the CPI data.
# View histogram of CPI values for store 4
df[df['Store'] == 4]['CPI'].hist()
<Axes: >
frame = df[df['Store'] == 4]['CPI']
print(f"Range: [{frame.min()}, {frame.max()}]")
Range: [126.064, 131.1930968]
As suspected, store 4 is completely misaligned with the other stores in terms of its CPI values, hence the huge gap in the histogram of ALL CPI values.
Unemployment¶
df['Unemployment'].hist()
print("Avg unemployment rate: " + str(df["Unemployment"].mean()))
Avg unemployment rate: 7.99915104895105
It looks like we have relatively high unemployment rates throughout most of our data. This is an important attribute to note as it may affect the generalizability of our ML Model and/or conclusions.
Feature vs Sales¶
Now, let's take a look at each feature individually and how they affect sales.
Tempature vs Sales¶
We want to use a hypothesis test to figure out whether temperature affects sales.
Null Hypothesis: The average temperature of the area has no effect on the sales.
Alternative Hypothesis: The average temperature of the area does have an effect on the sales.
Assume that we have a significance level of 0.05.
First, let's plot the stores, with their respective average temperature and weekly sales. We'll use a scatter plot, with each dot being a store from the set.
store_avgs = df.groupby("Store").mean("Temperature")
temp = store_avgs["Temperature"]
sales = store_avgs["Weekly_Sales"]
ts = np.vstack([temp,sales])
z = sci.stats.gaussian_kde(ts)(ts)
fig, plot = plt.subplots()
plt.title("Temperature vs. Weekly Sales")
plt.xlabel("Average Temperature")
plt.ylabel("Weekly Sales (in million USD)")
plot.scatter(temp, sales, c=z, s=100)
plt.show()
We can see that the stores are scattered around pretty evenly, with small clusters around the average temperature of 45-55 and 65-75 degree range.
Let's check the correlation between the two variables.
result = sci.stats.pearsonr(store_avgs["Temperature"], store_avgs["Weekly_Sales"], alternative="two-sided")
print(result)
PearsonRResult(statistic=-0.07638774908646385, pvalue=0.6179663423198961)
Notice that the Pearson correlation coefficient is -0.076, which is slightly negative, meaning that as the average temperature increases, weekly sales tend to decrease slightly. However, since the p-value of 0.617 is larger than our alpha of 0.05, we fail to reject the null hypothesis. In other words, we cannot conclude that the average temperature has an effect on weekly sales.
Fuel Price vs Sales¶
We can also use a hypothesis test to check if fuel prices have an effect on sales.
Null Hypothesis: Fuel Prices do not have any effect on weekly sales.
Alternative Hypothesis: Fuel prices do have an effect on weekly sales.
Assume the significance level as 0.05.
Let's first create a scatter plot graphing the correlation between fuel price and weekly sales.
store_avgs = df.groupby("Store").mean("Fuel_Price")
temp = store_avgs["Fuel_Price"]
sales = store_avgs["Weekly_Sales"]
ts = np.vstack([temp,sales])
z = sci.stats.gaussian_kde(ts)(ts)
fig, plot = plt.subplots()
plt.title("Fuel Price vs. Weekly Sales")
plt.xlabel("Average Fuel Price")
plt.ylabel("Weekly Sales (in million USD)")
plot.scatter(temp, sales, c=z, s=100)
plt.show()
The fuel prices seem divided into three clusters, centered roughly around 3.20, 3.45, and 3.60 USD per gallon. Within those three clusters, the weekly sales seem evenly spread out. On inspection, there appears to be a very, very slight downward trend as the fuel price goes up.
Let's run a pearson correlation coefficient test to find out the correlation:
result = sci.stats.pearsonr(store_avgs["Fuel_Price"], store_avgs["Weekly_Sales"], alternative="two-sided")
print(result)
PearsonRResult(statistic=0.06773422508957694, pvalue=0.6584189067151822)
Notice that the correlation coefficient is 0.068, which is slightly positive: as fuel prices go up, weekly sales increase very slightly. However, the effect is far too small to be meaningful.
Since our p-value of 0.66 is much higher than our alpha of 0.05, we fail to reject the null hypothesis. In other words, we cannot conclude that fuel price has an effect on weekly sales.
CPI vs Sales¶
We want to use a hypothesis test to figure out whether CPI affects sales.
Null Hypothesis: The average CPI has no effect on sales.
Alternative Hypothesis: The average CPI does have an effect on sales.
Assume that we have a significance level of 0.05.
First, let's plot the stores, with their respective average CPI and weekly sales. We'll use a scatter plot, with each dot being a store from the set.
store_avgs = df.groupby("Store").mean("CPI")
temp = store_avgs["CPI"]
sales = store_avgs["Weekly_Sales"]
ts = np.vstack([temp,sales])
z = sci.stats.gaussian_kde(ts)(ts)
fig, plot = plt.subplots()
plt.title("CPI vs. Weekly Sales")
plt.xlabel("CPI Temperature")
plt.ylabel("Weekly Sales (in million USD)")
plot.scatter(temp, sales, c=z, s=100)
plt.show()
We observe that the stores are divided into two groups: some stores cluster around 140 CPI on the left-hand side, while others cluster around 200 CPI on the right-hand side. Interestingly, there appears to be an even distribution of revenue among these points, indicating comparable sales performance across stores despite differences in CPI.
result = sci.stats.pearsonr(store_avgs["CPI"], store_avgs["Weekly_Sales"], alternative="two-sided")
print(result)
PearsonRResult(statistic=-0.07656886071136423, pvalue=0.6171309899200195)
Notice that the Pearson correlation coefficient is -0.0765, indicating a slight negative correlation: as the average CPI increases, weekly sales tend to decrease slightly. However, since the p-value of 0.617 is larger than our alpha of 0.05, we fail to reject the null hypothesis. In other words, we cannot conclude that the average CPI has a significant effect on weekly sales. This is supported by the scatter plot, where no noticeable trend in weekly sales is observed as CPI increases or decreases.
Unemployment vs Sales¶
We want to now test if unemployment rates have an effect on the stores' average weekly sales.
Null Hypothesis: The unemployment rate has no effect on the stores' average weekly sales.
Alternative Hypothesis: The unemployment rate does have an effect on the stores' average weekly sales.
Assume that our significance level is 0.05.
We can graph a scatter plot of each store:
temp = store_avgs["Unemployment"]
sales = store_avgs["Weekly_Sales"]
ts = np.vstack([temp,sales])
z = sci.stats.gaussian_kde(ts)(ts)
fig, plot = plt.subplots()
plt.title("Unemployment vs. Weekly Sales")
plt.xlabel("Average Unemployment Rate")
plt.ylabel("Weekly Sales (in million USD)")
plot.scatter(temp, sales, c=z, s=100)
plt.show()
We can see that most of the stores' average unemployment rates are between 6-9 percent, with sales being scattered pretty evenly across the board.
Let's now check the correlation between the two variables:
result = sci.stats.pearsonr(store_avgs["Unemployment"], store_avgs["Weekly_Sales"], alternative="two-sided")
print(result)
PearsonRResult(statistic=-0.11228079769921509, pvalue=0.4627451284572275)
Notice that the correlation coefficient is -0.112, which is slightly negative, meaning that as unemployment rates go up, weekly sales go slightly down. Since our p-value of 0.46 is greater than 0.05, we fail to reject the null hypothesis. In other words, we cannot conclude that the unemployment rate has an effect on weekly sales.
Holiday vs Sales¶
We will take a look at the relationship between holidays and sales for stores 1 through 4 since, as we saw from the Holiday_Flag distribution, holiday weeks are far less frequent than non-holiday weeks.
# Store 1, non-holiday weeks
non_holiday_1 = df[(df['Store'] == 1) & (df['Holiday_Flag'] == 0)].iloc[:140]
# Store 1, holiday weeks
holiday_1 = df[(df['Store'] == 1) & (df['Holiday_Flag'] == 1)].iloc[:140]
# Plot sales data for Store 1 during non-holiday weeks
plt.plot(non_holiday_1.index, non_holiday_1['Weekly_Sales'], color='blue', label='Non-Holiday Weeks')
# Plot sales data for Store 1 during holiday weeks
plt.plot(holiday_1.index, holiday_1['Weekly_Sales'], color='red', label='Holiday Weeks')
plt.xlabel('Week')
plt.ylabel('Weekly Sales (In Millions)')
plt.title('Weekly Sales for Store 1')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
The contrast between the sales data during non-holiday and holiday weeks highlights the impact of holidays on consumer behavior and sales activity for Store 1, providing valuable insight for strategic decision-making in retail management.
# Store 2, non-holiday weeks
non_holiday_2 = df[(df['Store'] == 2) & (df['Holiday_Flag'] == 0)].iloc[:140]
# Store 2, holiday weeks
holiday_2 = df[(df['Store'] == 2) & (df['Holiday_Flag'] == 1)].iloc[:140]
# Plot sales data for Store 2 during non-holiday weeks
plt.plot(non_holiday_2.index, non_holiday_2['Weekly_Sales'], color='blue', label='Non-Holiday Weeks')
# Plot sales data for Store 2 during holiday weeks
plt.plot(holiday_2.index, holiday_2['Weekly_Sales'], color='red', label='Holiday Weeks')
plt.xlabel('Week')
plt.ylabel('Weekly Sales (In Millions) ')
plt.title('Weekly Sales for Store 2')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
For the second store, we notice that the weekly sales amounts are significantly higher than those of Store 1.
# Store 3, non-holiday weeks
non_holiday_3 = df[(df['Store'] == 3) & (df['Holiday_Flag'] == 0)].iloc[:140]
# Store 3, holiday weeks
holiday_3 = df[(df['Store'] == 3) & (df['Holiday_Flag'] == 1)].iloc[:140]
# Plot sales data for Store 3 during non-holiday weeks
plt.plot(non_holiday_3.index, non_holiday_3['Weekly_Sales'], color='blue', label='Non-Holiday Weeks')
# Plot sales data for Store 3 during holiday weeks
plt.plot(holiday_3.index, holiday_3['Weekly_Sales'], color='red', label='Holiday Weeks')
plt.xlabel('Week')
plt.ylabel('Weekly Sales')
plt.title('Weekly Sales for Store 3')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
Store 3 consistently reports among the lowest weekly sales figures of the stores examined. Despite occasional spikes during holidays, its weekly sales (topping out around 550k) fall significantly short of Stores 1 and 2, whose strongest weeks reach well into the millions.
# Store 4 with non-Holidays
non_holiday_4 = df[(df['Store'] == 4) & (df['Holiday_Flag'] == 0)].iloc[:140]
# Store 4 with Holidays
holiday__4 = df[(df['Store'] == 4) & (df['Holiday_Flag'] == 1)].iloc[:140]
plt.plot(non_holiday_4.index, non_holiday_4['Weekly_Sales'], color='blue', label='Non-Holiday Weeks')
plt.plot(holiday__4.index, holiday__4['Weekly_Sales'], color='red', label='Holiday Weeks')
plt.xlabel('Week')
plt.ylabel('Weekly Sales(In Millions)')
plt.title('Weekly Sales for Store 4')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
Store 4 demonstrates consistent sales performance akin to that of Store 1 and Store 2. This observation suggests that Stores 1, 2, and 4 likely cater to a comparable population size and exhibit similar consumer foot traffic patterns, in contrast to Store 3.
Thus, from our analysis of stores 1 through 4, we see a similar spike in sales during holiday weeks, as shown in the visualizations. This indicates that these stores bring in higher revenue and see more consumers during holiday weeks, so they should continue to prioritize those weeks, since that is when they generate the greatest revenue.
Moreover, to confirm this, we can create a hypothesis test.
Null Hypothesis: Holiday weeks don't have an effect on weekly sales.
Alternative Hypothesis: Holiday weeks do have an effect on weekly sales.
Assume our alpha level is 0.05.
We can use a t test to determine if the means of sales during holiday weeks vs. non-holiday weeks are different:
holiday_diff = df.groupby("Holiday_Flag").mean("Weekly_Sales")
fig, ax = plt.subplots()
divisions = ["Non-Holiday", "Holiday"]
counts = [holiday_diff["Weekly_Sales"][0], holiday_diff["Weekly_Sales"][1]]
ax.bar(divisions, counts, color=["tab:red", "tab:orange"])
ax.set_ylabel("Weekly Sales (in million USD)")
ax.set_title("Non-Holiday & Holiday vs. Weekly Sales")
plt.show()
Now, let's run the t test to see if the differences in results are statistically significant:
holiday = df[df["Holiday_Flag"] == 1]["Weekly_Sales"]
no_holiday = df[df["Holiday_Flag"] == 0]["Weekly_Sales"]
result = sci.stats.ttest_ind(holiday, no_holiday)
print(result)
Ttest_indResult(statistic=2.9608919093259036, pvalue=0.003078699263818616)
Notice that our p value is 0.003, which is way lower than the alpha level of 0.05. Through this, we can reject the null hypothesis, meaning that holiday weeks do have an effect on weekly sales! We can also see the differences in the weekly sales for holidays vs. non-holiday weeks in the bar graph above.
Month vs. Sales¶
We can use a hypothesis test to figure out if the month affects the weekly sales per store. Note how it wouldn't make sense if we combined stores' weekly sales for each month, because some stores do way better in general than others. If we did do that, then we wouldn't be able to see the differences in month for a particular store.
Null Hypothesis: The month of the year does not have an effect on weekly sales.
Alternative Hypothesis: The month of the year does have an effect on weekly sales.
Assume a significance level of 0.05.
First of all, let's draw a graph noting the average weekly sales for each month for Store 1:
months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
def determine_month(date):
return months[date.month-1]
df["Month"] = df["Date"].apply(determine_month)
def avg(lst):
return sum(lst) / len(lst)
fig, ax = plt.subplots()
list_of_months = [[],[],[],[],[],[],[],[],[],[],[],[]]
for index in range(len(months)):
curr_month = []
for ind, item in df[(df["Month"] == months[index]) & (df["Store"] == 1)].iterrows():
curr_month.append(float(item["Weekly_Sales"]))
list_of_months[index] = avg(curr_month)
divisions = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
counts = list_of_months
ax.bar(divisions, counts, color=["tab:red"])
ax.set_ylabel("Weekly Sales (in million USD)")
ax.set_title("Month vs. Weekly Sales for Store 1")
plt.show()
Note that Store 1 seems to be doing way better in sales during the months of November, December, and February. For every other month, it seems to be doing roughly the same numbers (around 1.5 million USD).
Now, let's conduct ANOVA tests for every store.
Note that we want to see if there exists a difference in the means of weekly sales for each month for each store:
p_value_results = []
def anova_test(store):
january = []
february = []
march = []
april = []
may = []
june = []
july = []
august = []
september = []
october = []
november = []
december = []
list_of_months = [january, february, march, april, may, june, july, august, september, october, november, december]
for index in range(len(months)):
for ind, item in df[(df["Month"] == months[index]) & (df["Store"] == store)].iterrows():
list_of_months[index].append(float(item["Weekly_Sales"]))
result = sci.stats.f_oneway(january, february, march, april, may, june, july, august, september, october, november, december)
print("Store " + str(store) + " test pvalue: " + str(result.pvalue))
p_value_results.append(result.pvalue)
for num in range(1, 46):
anova_test(num)
Store 1 test pvalue: 2.7724825133205463e-07 Store 2 test pvalue: 4.330415081536038e-12 Store 3 test pvalue: 2.3031287514697588e-11 Store 4 test pvalue: 5.044527323318062e-08 Store 5 test pvalue: 3.281519930519039e-08 Store 6 test pvalue: 3.475224266634057e-14 Store 7 test pvalue: 2.2422407017372865e-16 Store 8 test pvalue: 8.009722452114776e-11 Store 9 test pvalue: 2.4807011905056473e-09 Store 10 test pvalue: 1.5526974366305664e-16 Store 11 test pvalue: 1.628313504395449e-12 Store 12 test pvalue: 1.6336003498407762e-12 Store 13 test pvalue: 8.359846513589073e-11 Store 14 test pvalue: 3.9656955165561375e-08 Store 15 test pvalue: 4.640380261674347e-13 Store 16 test pvalue: 1.2039358996831871e-18 Store 17 test pvalue: 0.00011345095945005678 Store 18 test pvalue: 2.3721149491082532e-11 Store 19 test pvalue: 1.0812412446782531e-10 Store 20 test pvalue: 7.210105978784641e-13 Store 21 test pvalue: 1.7292012536913808e-12 Store 22 test pvalue: 9.50619842442727e-14 Store 23 test pvalue: 7.742627133025905e-15 Store 24 test pvalue: 5.094605594267623e-10 Store 25 test pvalue: 3.14810055322353e-18 Store 26 test pvalue: 7.469303673990814e-11 Store 27 test pvalue: 1.538364002585109e-09 Store 28 test pvalue: 2.0184361415396076e-06 Store 29 test pvalue: 5.697335574183187e-12 Store 30 test pvalue: 0.00986728581291941 Store 31 test pvalue: 9.014256351204674e-07 Store 32 test pvalue: 7.904159131772652e-10 Store 33 test pvalue: 0.0014863778723760027 Store 34 test pvalue: 2.391261801278534e-09 Store 35 test pvalue: 0.0002707253217012239 Store 36 test pvalue: 0.555865376310303 Store 37 test pvalue: 0.00016745376727450595 Store 38 test pvalue: 0.9958102761851833 Store 39 test pvalue: 2.099085865751722e-08 Store 40 test pvalue: 4.913845032367567e-10 Store 41 test pvalue: 7.995742455192548e-09 Store 42 test pvalue: 0.6693766507442821 Store 43 test pvalue: 0.40241402241743385 Store 44 test pvalue: 0.3549543354932103 Store 45 test pvalue: 1.8172742505154496e-13
Some stores (Stores 36, 38, 42, 43, and 44) exhibit p-values above our significance level, ranging as high as roughly 0.99, so for those stores we fail to reject the null hypothesis. However, most stores exhibit very low p-values, far below our significance level, meaning we reject the null hypothesis for them. Let's count how many stores had a result below 0.05 and how many had a result above it:
num_less = 0
num_more = 0
for value in p_value_results:
if value <= 0.05:
num_less = num_less + 1
elif value > 0.05:
num_more = num_more + 1
print("Number of stores with p values less than 0.05: " + str(num_less))
print("Number of stores with p values more than 0.05: " + str(num_more))
Number of stores with p values less than 0.05: 40 Number of stores with p values more than 0.05: 5
We can conclude that for the most part, the months of the year do affect the average sales per store.
Primary Analysis (Model Exploration) and Visualization¶
We will now bring in some ML models to predict weekly sales given all the other features available. We will be taking a causal approach where we assume that the features in a given row can be utilized to predict the weekly sales for that week.
Standardizing Features¶
First, we will standardize our features to ensure that differences in scale across columns do not artificially inflate the importance of any one feature.
# Create a copy of the original df (incase we want the original to do a comparison)
standardizedDf = df.copy()
standardizedDf
from scipy.stats import zscore
standardizedDf['Temperature'] = zscore(standardizedDf['Temperature'])
standardizedDf['Fuel_Price'] = zscore(standardizedDf['Fuel_Price'])
standardizedDf['CPI'] = zscore(standardizedDf['CPI'])
standardizedDf['Unemployment'] = zscore(standardizedDf['Unemployment'])
# What do we do with dates?
# If we use them, how do we handle them?
# We could do categorical columns for Year, Month, and Week
# If we choose not to use them:
# Cool, fine, lets see how it performs :)
standardizedDf
Index | Store | Date | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | Month |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2010-02-05 | 1643690.90 | 0 | -0.995136 | -1.713800 | 1.004175 | 0.056964 | February |
1 | 1 | 2010-02-12 | 1641957.44 | 1 | -1.201170 | -1.766089 | 1.007880 | 0.056964 | February |
2 | 1 | 2010-02-19 | 1611968.17 | 0 | -1.124178 | -1.840166 | 1.009074 | 0.056964 | February |
3 | 1 | 2010-02-26 | 1409727.59 | 0 | -0.760907 | -1.737766 | 1.009849 | 0.056964 | February |
4 | 1 | 2010-03-05 | 1554806.68 | 0 | -0.767955 | -1.598328 | 1.010624 | 0.056964 | March |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6430 | 45 | 2012-09-28 | 713173.95 | 0 | 0.228602 | 1.390883 | 0.519270 | 0.365109 | September |
6431 | 45 | 2012-10-05 | 733455.07 | 0 | 0.229144 | 1.364738 | 0.523256 | 0.356046 | October |
6432 | 45 | 2012-10-12 | 734464.36 | 0 | -0.335825 | 1.397419 | 0.527241 | 0.356046 | October |
6433 | 45 | 2012-10-19 | 718125.53 | 0 | -0.227385 | 1.329879 | 0.527332 | 0.356046 | October |
6434 | 45 | 2012-10-26 | 760281.43 | 0 | -0.098343 | 1.140330 | 0.526775 | 0.356046 | October |
6435 rows × 9 columns
Converting Date Into Ordinal¶
After standardizing our data we will be converting our date into an integer so that our model can use it to predict the weekly sales.
standardizedDf['Date']=standardizedDf['Date'].map(dt.datetime.toordinal)
# Also, remove month column
standardizedDf = standardizedDf.drop('Month', axis=1)
standardizedDf
Index | Store | Date | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment |
---|---|---|---|---|---|---|---|---|
0 | 1 | 733808 | 1643690.90 | 0 | -0.995136 | -1.713800 | 1.004175 | 0.056964 |
1 | 1 | 733815 | 1641957.44 | 1 | -1.201170 | -1.766089 | 1.007880 | 0.056964 |
2 | 1 | 733822 | 1611968.17 | 0 | -1.124178 | -1.840166 | 1.009074 | 0.056964 |
3 | 1 | 733829 | 1409727.59 | 0 | -0.760907 | -1.737766 | 1.009849 | 0.056964 |
4 | 1 | 733836 | 1554806.68 | 0 | -0.767955 | -1.598328 | 1.010624 | 0.056964 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
6430 | 45 | 734774 | 713173.95 | 0 | 0.228602 | 1.390883 | 0.519270 | 0.365109 |
6431 | 45 | 734781 | 733455.07 | 0 | 0.229144 | 1.364738 | 0.523256 | 0.356046 |
6432 | 45 | 734788 | 734464.36 | 0 | -0.335825 | 1.397419 | 0.527241 | 0.356046 |
6433 | 45 | 734795 | 718125.53 | 0 | -0.227385 | 1.329879 | 0.527332 | 0.356046 |
6434 | 45 | 734802 | 760281.43 | 0 | -0.098343 | 1.140330 | 0.526775 | 0.356046 |
6435 rows × 8 columns
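For reference, toordinal maps each date to the number of days since January 1 of year 1 (the proleptic Gregorian ordinal), which is why the Date column now holds values like 733808:
# Sanity check of the ordinal conversion
print(dt.date(2010, 2, 5).toordinal())  # 733808, matching the first row above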
Comparing Linear Regression, Decision Tree, and Random Forest¶
Now let's take some time to compare our options. We will start with some of the most popular models for regression, including:
- Linear Regression
- Decision Tree
- Random Forest
After we explore each of these options, we can make a decision to either stick with one of them or explore another set of models that may work well for our data!
We will be using a k-fold validation technique that splits the training set into k subsets and cycles through utilizing each subset as the evaluation set once. During this cycle, we will keep track of the R^2 value and take the average to determine on average how well each model explains the variance observed in the data.
This takes the focus off of "getting lucky" with the training data and makes our findings more robust and repeatable.
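To make the cycling concrete, here is a tiny illustration of how KFold hands out folds (toy indices, separate from our data):
# Each of the 5 folds takes a turn as the validation set
toy_X = np.arange(10).reshape(-1, 1)
for fold, (train_idx, val_idx) in enumerate(KFold(n_splits=5, shuffle=True, random_state=42).split(toy_X)):
    print(f"Fold {fold}: train rows {train_idx}, validation rows {val_idx}")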
# Consider setting aside train, validate, test
# Here, we could show training data set accuracy vs validation set accuracy as we go through training epochs
# This could help us show overfitting vs underfitting
# Define X and y
y = standardizedDf['Weekly_Sales']
X = standardizedDf.drop('Weekly_Sales', axis=1)
# Get train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# Perform k-fold Cross-Validation for each model
k_folds = 5
skf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
# Compare a Decision Tree, a Random Forest, and polynomial Linear Regression of increasing degree
for k in range(-1, 8):
if k == -1:
model = DecisionTreeRegressor()
score = cross_val_score(model, X_train, y_train, cv=skf, scoring='r2')
print("Decision Tree Regressor R^2 Val: " + str(score.mean()))
print("-----------------------------------------------------")
continue
if k == 0:
model = RandomForestRegressor()
score = cross_val_score(model, X_train, y_train, cv=skf, scoring='r2')
print("Random Forest Regressor R^2 Val: " + str(score.mean()))
print("-----------------------------------------------------")
continue
model = make_pipeline(PolynomialFeatures(k), LinearRegression())
score = cross_val_score(model, X_train, y_train, cv=skf, scoring='r2')
print("Linear Regressor of degree " + str(k) + " R^2 Val: " + str(score.mean()))
print("-----------------------------------------------------")
Decision Tree Regressor R^2 Val: 0.8778749895542678 ----------------------------------------------------- Random Forest Regressor R^2 Val: 0.9304599256542752 ----------------------------------------------------- Linear Regressor of degree 1 R^2 Val: 0.13397245632480764 ----------------------------------------------------- Linear Regressor of degree 2 R^2 Val: 0.25047806329282685 ----------------------------------------------------- Linear Regressor of degree 3 R^2 Val: 0.3371864409081696 ----------------------------------------------------- Linear Regressor of degree 4 R^2 Val: 0.3378576496910995 ----------------------------------------------------- Linear Regressor of degree 5 R^2 Val: 0.3377816180759822 ----------------------------------------------------- Linear Regressor of degree 6 R^2 Val: 0.32211118760617324 ----------------------------------------------------- Linear Regressor of degree 7 R^2 Val: 0.28173126974169926 -----------------------------------------------------
It looks like Random Forest Regressor is giving us the best performance so we will be utilizing that model to perform our final predictions!
Final Model Selection: RandomForestRegressor¶
Why?
- Far better R2 score!
- It seems to be doing a far better job at capturing non-linearity in the data
Optimizing Hyperparameters¶
Now that we have decided which model we want to use, we should take some time to optimize some of the hyperparameters using a validation data set. The validation data set will be a portion of the dataset that we use to adjust hyperparameters after training.
The hyperparameters we will be looking at are the following:
- Depth Limit: This applies a height limit to each tree in the ensemble model, which can help us avoid overfitting.
- The # of Trees: This hyperparameter - as the name suggests - modifies the number of trees in our model. More trees could possibly offer improved performance, but at the cost of additional computation.
Depth Limit¶
Let's take a look at how our validation and training loss change as we modify the depth. We will be using MSE, the mean squared error, to quantify our loss: the average of the squared differences between our model's predictions and the expected outputs from our labeled data.
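As a quick toy illustration of what MSE measures (made-up values, not from our dataset):
# MSE computed by hand vs. sklearn's helper, on hypothetical numbers
toy_true = np.array([1.0, 2.0, 3.0])
toy_pred = np.array([1.5, 1.5, 3.5])
print(np.mean((toy_true - toy_pred) ** 2))     # 0.25
print(mean_squared_error(toy_true, toy_pred))  # same value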
# Split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
# Split train further to get validation set
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.20, random_state=42)
trainScores = []
validationScores = []
depths = []
for depth in tqdm(range(1, 20)):
model = RandomForestRegressor(max_depth= depth)
model.fit(X_train, y_train)
trainScores.append(mean_squared_error(y_train,model.predict(X_train)))
validationScores.append(mean_squared_error(y_validation, model.predict(X_validation)))
depths.append(depth)
tLossSmooothed = gaussian_filter1d(trainScores, sigma=2)
vLossSmooothed = gaussian_filter1d(validationScores, sigma=2)
plt.plot(depths, tLossSmooothed, label='Training loss')
plt.plot(depths, vLossSmooothed, label='Validation loss')
plt.title(label="Smoothed MSE loss")
plt.xlabel(xlabel="depth")
plt.legend()
plt.show()
plt.plot(depths, trainScores, label='Training MSE loss')
plt.plot(depths, validationScores, label='Validation MSE loss')
plt.title(label="MSE loss")
plt.xlabel(xlabel="depth")
plt.legend()
plt.show()
100%|██████████| 19/19 [00:25<00:00, 1.34s/it]
It seems like the optimal depth limit is around 8 to 12!
Number of Decision Trees¶
Let's do the same for the number of trees in our model. That is, let's see how the loss changes as we increase the number of trees and try to pinpoint the optimal count to use for our model.
# Fixing depth to 12
depth = 12
trainScores = []
validationScores = []
trees = []
for numTrees in tqdm(range(80, 500, 5)):
model = RandomForestRegressor(max_depth= depth, n_estimators=numTrees)
model.fit(X_train, y_train)
trainScores.append(mean_squared_error(y_train,model.predict(X_train)))
validationScores.append(mean_squared_error(y_validation, model.predict(X_validation)))
trees.append(numTrees)
tLossSmooothed = gaussian_filter1d(trainScores, sigma=2)
vLossSmooothed = gaussian_filter1d(validationScores, sigma=2)
plt.plot(trees, tLossSmooothed, label='Training loss')
plt.plot(trees, vLossSmooothed, label='Validation loss')
plt.title(label="Smoothed MSE loss")
plt.xlabel(xlabel="number of trees")
plt.legend()
plt.show()
plt.plot(trees, trainScores, label='Training MSE loss')
plt.plot(trees, validationScores, label='Validation MSE loss')
plt.title(label="MSE loss")
plt.xlabel(xlabel="number of trees")
plt.legend()
plt.show()
100%|██████████| 84/84 [06:14<00:00, 4.46s/it]
It seems like increasing the number of decision trees doesn't have an appreciable effect on the loss. Hence, we will be using the default number of trees, 100, for our model.
Training the Random Forest Model¶
Let's go ahead and train a few RandomForestRegressors on our training data and evaluate them on our test data. We will try a few different depth options and dig deeper into their performance on the test set.
# Splitting the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# The best depth found above appears to be between 8 and 12.
# Digging down to find the best depth
for i in range(8, 13):
# Create the model with i depth and default number of trees
model = RandomForestRegressor(max_depth=i)
model.fit(X_train, y_train) # Train model on the initial training set
# Predict the test set
y_pred = model.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = np.mean(np.abs(y_test - y_pred))
r2 = r2_score(y_test, y_pred)
print("STATISTICS FOR DEPTH %i" % i)
print("-----------------------------------------")
print(f"Test MSE: {mse}")
print(f"Test sqrt of mse: {math.sqrt(mse)}")
print(f"Test RMSE: {rmse}")
print(f"Test MAE: {mae}")
print(f"Test R² score: {r2}")
# Visualization of Actual vs Predicted
plt.figure()
plt.scatter(y_test, y_pred, alpha=0.3, color='blue') # alpha for transparency
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4) # Ideal line
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs. Predicted at Depth %i' % i)
plt.show()
# Residuals plot
residuals = y_test - y_pred
plt.figure()
plt.scatter(y_pred, residuals, alpha=0.3, color='red')
plt.hlines(y=0, xmin=y_pred.min(), xmax=y_pred.max(), colors='black', linestyles='dashed')
plt.xlabel('Predicted')
plt.ylabel('Residuals')
plt.title('Residuals Plot at Depth %i' % i)
plt.show()
STATISTICS FOR DEPTH 8 ----------------------------------------- Test MSE: 29778496329.666405 Test sqrt of mse: 172564.470067469 Test RMSE: 172564.470067469 Test MAE: 97833.58436878757 Test R² score: 0.9068608891772729
STATISTICS FOR DEPTH 9 ----------------------------------------- Test MSE: 24801365010.294613 Test sqrt of mse: 157484.49133262175 Test RMSE: 157484.49133262175 Test MAE: 87091.17147131398 Test R² score: 0.9224280145419078
STATISTICS FOR DEPTH 10 ----------------------------------------- Test MSE: 22152826791.83312 Test sqrt of mse: 148838.25715128862 Test RMSE: 148838.25715128862 Test MAE: 80184.27408842757 Test R² score: 0.9307119282733665
STATISTICS FOR DEPTH 11 ----------------------------------------- Test MSE: 21431918678.967293 Test sqrt of mse: 146396.44353250967 Test RMSE: 146396.44353250967 Test MAE: 78543.61870800186 Test R² score: 0.9329667345561914
STATISTICS FOR DEPTH 12 ----------------------------------------- Test MSE: 20968460038.47253 Test sqrt of mse: 144804.9033647429 Test RMSE: 144804.9033647429 Test MAE: 77155.06833436356 Test R² score: 0.9344163082754591
Notice that the plots look quite similar across depths, but depth 12 has a better R2 score than depth 8 (0.907 vs. 0.934)!
Evaluating Our Initial Model¶
model = RandomForestRegressor(max_depth=12)
model.fit(X_train, y_train) # Train model on the initial training set
t_sizes, t_scores, test_scores = learning_curve(model, X_train, y_train)
display = LearningCurveDisplay(train_sizes=t_sizes, train_scores=t_scores, test_scores=test_scores, score_name = "R2 Score")
display.plot()
plt.title("R2 Score vs. Number of samples in training set")
plt.show()
As expected, the R2 score of our model increases drastically once the training set has more than about 1,000 samples, with the validation (test) score climbing to roughly 0.93.
For our training set, the peak R2 score appears to be around 0.97 - 0.98.
# Checking out the most 'important' features in predicting weekly sales
feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
feature_importance_df.head(10)
Index | Feature | Importance |
---|---|---|
0 | Store | 0.660073 |
5 | CPI | 0.160227 |
6 | Unemployment | 0.118357 |
3 | Temperature | 0.025772 |
4 | Fuel_Price | 0.016556 |
1 | Date | 0.015550 |
2 | Holiday_Flag | 0.003467 |
In this case, the top 3 most impactful features are the Store, CPI, and unemployment.
The store itself influences sales by reflecting differences in demographics, location, and what products local customers purchase; changes in the Consumer Price Index affect purchasing power; and unemployment squeezes consumer budgets, reducing discretionary spending. Together, these factors shape weekly sales for the company.
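If a visual is preferred, the same importances can be plotted as a quick bar chart (a minimal sketch using the DataFrame built above):
feature_importance_df.plot(kind="bar", x="Feature", y="Importance", legend=False)
plt.ylabel("Importance")
plt.title("Random Forest Feature Importances")
plt.tight_layout()
plt.show()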
Insights and Conclusions¶
It is clear that our model is able to predict weekly sales values fairly accurately by relying on the causal relationship between the features and the output label. This is exemplified by our high R^2 value and our relatively low mean error compared to the mean and standard deviation of weekly sales!
We were able to identify the most important features that helped our RandomForestRegressor deduce fairly accurate predictions.
The most important feature was the store number. This generally makes sense: you expect each store to have reproducible sales numbers, seeing that Walmart is in the mature stage of the business life cycle. Moreover, this feature captures the fluctuations between stores that may have vastly different locations, competitors, distribution strategies, or any other factor that could affect the sales data for a specific store.
The second most important feature was CPI. This makes sense as the consumer price index is the most well-known indicator of inflation. Inflation is an extremely relevant construct when it comes to sales. It is one of the many ways to check the status of the economy, and it influences how people spend their money on certain products (When inflation is high, some products might not seem as worth it to purchase compared to others).
As a leader in supply chain management, Walmart heavily relies on accurate forecasting to drive its basis of competition. Walmart, as the slogan suggests, competes on low cost, and in order to achieve these low costs they must be able to reduce their inventory on hand. This is done by improving forecasting accuracy. Higher forecasting accuracy reduces the need for safety stock and thus makes their supply chain more competitive as less stock means cost reductions for all the distribution partners in the network. Accurate forecasting has huge implications for supply chain management which can be considered the primary backbone for any business in today's globalized & high volume business environment.