1-intervention-analysis.ipynb
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [2]:
df1 = pd.read_csv("anonymized_all.csv")        # full history across fiscal years
df2 = pd.read_csv("anonymized_fy24_only.csv")  # FY2024-only extract
In [3]:
df1.head()
   Unique Identifier Start Date    Client Name      Job ID
0             537926    6/14/21  537926 Record  job-188418
1             476870    6/18/21  476870 Record  job-188751
2             458255    6/14/21  458255 Record  job-189026
3             479256     8/4/20  479256 Record  job-181914
4             472697     8/7/20  472697 Record  job-181916
In [4]:
df2.head()
   Unique Identifier Start Date    Client Name      Job ID
0             581013    9/25/23  581013 Record  job-193451
1             583586    7/17/23  583586 Record  job-198299
2             585047    7/11/23  585047 Record  job-198331
3             576444     7/9/23  576444 Record  job-198365
4             585401    9/18/23  585401 Record  job-198436
In [5]:
def clean_df(df, date_col, drop_col):
    """Parse the date column, drop an identifying column, and lowercase the headers."""
    df = df.copy()  # work on a copy so the raw DataFrame is left untouched
    df[date_col] = pd.to_datetime(df[date_col], format="%m/%d/%y")
    df = df.drop(columns=[drop_col])
    df.columns = df.columns.str.lower()
    return df

cleaned_df1 = clean_df(df1, 'Start Date', 'Client Name')
cleaned_df2 = clean_df(df2, 'Start Date', 'Client Name')
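A quick sanity check that the cleaning behaved as expected (an illustrative sketch, not part of the original cells):

# Dates parsed to datetime64 and the identifying column dropped
assert cleaned_df1['start date'].dtype == 'datetime64[ns]'
assert 'client name' not in cleaned_df1.columns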
In [6]:
cleaned_df1.head()
   unique identifier start date      job id
0             537926 2021-06-14  job-188418
1             476870 2021-06-18  job-188751
2             458255 2021-06-14  job-189026
3             479256 2020-08-04  job-181914
4             472697 2020-08-07  job-181916
In [7]:
def fiscal_year(date):
    """Map a start date to its fiscal year (FY runs July through June, labeled by the ending year)."""
    if date.month >= 7:
        return date.year + 1
    else:
        return date.year

cleaned_df1['fiscal year'] = cleaned_df1['start date'].apply(fiscal_year)
cleaned_df2['fiscal year'] = cleaned_df2['start date'].apply(fiscal_year)
In [8]:
cleaned_df1['fiscal year'].unique()
array([2021, 2022, 2023, 2024])
In [9]:
cleaned_df2['fiscal year'].unique()
array([2024])
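Row-wise .apply is fine at this scale; for much larger frames, the same July-start rule could be vectorized. A minimal sketch, assuming the cleaned frames above (not part of the original cells):

# Vectorized equivalent of fiscal_year(): add 1 to the calendar year for July-December starts
months = cleaned_df1['start date'].dt.month
years = cleaned_df1['start date'].dt.year
cleaned_df1['fiscal year'] = np.where(months >= 7, years + 1, years)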
In [11]:
# Restrict the full-history extract to FY2024 so it lines up with the FY24-only file
cleaned_previous = cleaned_df1[cleaned_df1['fiscal year'] == 2024]
cleaned_previous
       unique identifier start date      job id  fiscal year
8381              581013 2023-09-25  job-193451         2024
8382              583586 2023-07-17  job-198299         2024
8383              585047 2023-07-11  job-198331         2024
8384              576444 2023-07-09  job-198365         2024
8385              585401 2023-09-18  job-198436         2024
...                  ...        ...         ...          ...
11009             619903 2024-03-13  job-205994         2024
11010             602062 2024-03-26  job-206010         2024
11011             598480 2024-03-06  job-206013         2024
11012             615498 2024-03-03  job-206017         2024
11013             460343 2024-02-11  job-206027         2024

2633 rows × 4 columns
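Both frames should describe the same FY2024 population, so a row-count and ID comparison is a cheap consistency check. An illustrative sketch (not in the original notebook):

# Compare sizes and look for job ids present only in the full-history extract
print(len(cleaned_previous), len(cleaned_df2))
only_in_previous = set(cleaned_previous['job id']) - set(cleaned_df2['job id'])
print(f"{len(only_in_previous)} job ids appear only in the full-history extract")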

In [12]:
# Count FY24 starts per Sunday-ending week in the full-history extract
previous_weekly_performance = (
  cleaned_previous
  .groupby(['fiscal year', pd.Grouper(key='start date', freq='W')]).size()
  .reset_index(name='previous_weekly_performance')
)
previous_weekly_performance.head()
   fiscal year start date  previous_weekly_performance
0         2024 2023-07-02                           15
1         2024 2023-07-09                           77
2         2024 2023-07-16                           99
3         2024 2023-07-23                           71
4         2024 2023-07-30                           72
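Note that freq='W' is an alias for 'W-SUN', so each weekly bin is labeled by its Sunday end date. A one-line illustration (added here; not an original cell):

# 2023-07-03 is a Monday, so it falls in the week labeled by Sunday 2023-07-09
pd.Timestamp('2023-07-03').to_period('W').end_time.normalize()  # Timestamp('2023-07-09 00:00:00')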
In [13]:
# Running total of weekly counts within the fiscal year
previous_weekly_performance['previous_cumulative_performance'] = (
  previous_weekly_performance
  .groupby('fiscal year')['previous_weekly_performance']
  .cumsum()
)
previous_weekly_performance.head()
   fiscal year start date  previous_weekly_performance  previous_cumulative_performance
0         2024 2023-07-02                           15                               15
1         2024 2023-07-09                           77                               92
2         2024 2023-07-16                           99                              191
3         2024 2023-07-23                           71                              262
4         2024 2023-07-30                           72                              334
In [14]:
# Same weekly count, computed from the FY24-only extract
current_weekly_performance = (
  cleaned_df2
  .groupby(['fiscal year', pd.Grouper(key='start date', freq='W')]).size()
  .reset_index(name='current_weekly_performance')
)
current_weekly_performance.head()
   fiscal year start date  current_weekly_performance
0         2024 2023-07-02                          15
1         2024 2023-07-09                          77
2         2024 2023-07-16                          98
3         2024 2023-07-23                          72
4         2024 2023-07-30                          72
In [15]:
# Running total of the current weekly counts
current_weekly_performance['current_cumulative_performance'] = (
  current_weekly_performance
  .groupby('fiscal year')['current_weekly_performance']
  .cumsum()
)
current_weekly_performance.head()
   fiscal year start date  current_weekly_performance  current_cumulative_performance
0         2024 2023-07-02                          15                              15
1         2024 2023-07-09                          77                              92
2         2024 2023-07-16                          98                             190
3         2024 2023-07-23                          72                             262
4         2024 2023-07-30                          72                             334
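Since each series covers a single fiscal year, the last cumulative value should equal the total row count of its extract. An illustrative invariant check (not part of the original cells):

# The final cumulative totals must match the extract sizes
assert previous_weekly_performance['previous_cumulative_performance'].iloc[-1] == len(cleaned_previous)
assert current_weekly_performance['current_cumulative_performance'].iloc[-1] == len(cleaned_df2)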
In [16]:
# Outer-join the two weekly series on fiscal year and week so gaps in either extract are kept
merged_df = pd.merge(previous_weekly_performance, current_weekly_performance, how='outer')
merged_df.head()
   fiscal year start date  previous_weekly_performance  previous_cumulative_performance  current_weekly_performance  current_cumulative_performance
0         2024 2023-07-02                           15                               15                          15                              15
1         2024 2023-07-09                           77                               92                          77                              92
2         2024 2023-07-16                           99                              191                          98                             190
3         2024 2023-07-23                           71                              262                          72                             262
4         2024 2023-07-30                           72                              334                          72                             334
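Before plotting, it can help to quantify exactly where the two extracts diverge. A hypothetical follow-up sketch (the weekly_gap name is an assumption, not from the original notebook):

# Weeks where the FY24-only extract disagrees with the full-history extract
mismatch = merged_df.assign(
    weekly_gap=merged_df['current_weekly_performance'] - merged_df['previous_weekly_performance']
)
mismatch.loc[mismatch['weekly_gap'] != 0, ['start date', 'weekly_gap']]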
In [17]:
# Persist the merged weekly series before plotting
merged_df.to_csv("weekly_performance.csv", index=False)

# Plotting 1
# Set 'start date' as the index for better plotting
merged_df.set_index('start date', inplace=True)

fig, ax = plt.subplots(figsize=(12, 8))

# Width of the bars
bar_width = 0.35

# Positions of the bars
index = range(len(merged_df))

# Plotting 'previous_weekly_performance'
bars1 = ax.bar(index, merged_df['previous_weekly_performance'], width=bar_width, label='Previous Weekly Performance', color='b')

# Plotting 'current_weekly_performance' shifted right by bar_width to place next to the previous
bars2 = ax.bar([p + bar_width for p in index], merged_df['current_weekly_performance'], width=bar_width, label='Current Weekly Performance', color='r')

# Adding labels, title, and legend
ax.set_xlabel('Start Date')
ax.set_ylabel('Weekly Performance')
ax.set_title('Comparison of Previous and Current Weekly Performance')
ax.legend()

# Set x-ticks to be in the middle of the two bars for each date
ax.set_xticks([p + bar_width / 2 for p in index])
ax.set_xticklabels(merged_df.index.strftime('%Y-%m-%d'), rotation=45)

plt.show()


# Plotting 2
# Resetting the index to put 'start date' back as a regular column
merged_df.reset_index(inplace=True)


plt.figure(figsize=(10, 6))

# Plotting the 'previous_cumulative_performance'
plt.plot(merged_df['start date'], merged_df['previous_cumulative_performance'], label='Previous Cumulative Performance', marker='o')

# Plotting the 'current_cumulative_performance'
plt.plot(merged_df['start date'], merged_df['current_cumulative_performance'], label='Current Cumulative Performance', marker='o')
 
plt.title('Comparison of Previous and Current Cumulative Performance Over Time')
plt.xlabel('Start Date')
plt.ylabel('Cumulative Performance')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)  # Rotates the dates on the x-axis for better visibility
plt.tight_layout()  # Adjusts plot parameters to give some padding and prevent overlap

plt.show()
(a) The bar chart places the previous and current weekly counts side by side, making week-to-week fluctuations and possible seasonal patterns visible, though the density of bars can make it hard to read at a glance.
(b) The line chart of cumulative totals smooths out the weekly noise, emphasizing the long-term trend and any divergence between the two series.
Figure 1: Initial stage of the data visualization workflow, using basic plots to explore and refine the key trends.