Use of search is important in understanding the performance of the conversion funnel, but how to consider it as part of the funnel is the question.
Step Name | Event Name | Step number |
---|---|---|
HP Visits | home_page | 1 |
Accessed Store Page | store_page | 2 |
Accessed Checkout Page | checkout_page | 3 |
Completed and Order | successful_checkout_page | 4 |
The data for a subset of US users for a 1 week time range:
Field Name | Description |
---|---|
event_time | timestamp without timezone |
user_id | string (hash) |
event_type | string |
platform | string |
country | string |
region | string |
device_id | string (hash) |
initial_referring_domain | string |
import time
import mysql.connector
from datetime import date
import numpy as np
import pandas as pd
from BI_config import (USR, PASS, HST, DB)
mydb = mysql.connector.connect(user=USR, password=PASS,
host=HST,
database=DB)
curA = mydb.cursor(buffered=True, dictionary = True)
file = "business_intelligence_data.csv"
df = pd.read_csv(file)
print(df.info())
print(df.head())
print(df.nunique())
#new funnel model based on search_event
funnel_2 = {'home_page':1,
'store_ordering_page':2,
'search_event':3,
'checkout_page':4,
'checkout_success':5}
funnel_df = pd.DataFrame.from_dict(funnel_2, orient='index')
funnel_df.reset_index(inplace=True)
funnel_df.columns=['event_type', 'funnel_step']
df = pd.merge(df, funnel_df, on='event_type')
In order to be able to drill down on useer behavior in the funnel, I created the following session based variables:
df = df.astype({'event_time':'datetime64', 'event_type':'category', 'platform':'category'})
df.sort_values(['user_id', 'event_time'], inplace=True)
df.reset_index(drop=True, inplace=True)
df['session_id'] = 0
df['event_duration'] = 0
df['event_duration'] = df['event_duration'].astype('timedelta64[ns]')
df['searched'] = 0
df['ordered'] = 0
for i in range(len(df)-1):
df.iloc[i, df.columns.get_loc('session_id')] = i
if df.iloc[i, df.columns.get_loc('user_id')] == df.iloc[i-1, df.columns.get_loc('user_id')] and \
df.iloc[i, df.columns.get_loc('device_id')] == df.iloc[i-1, df.columns.get_loc('device_id')] and \
df.iloc[i-1, df.columns.get_loc('event_type')] != 'checkout_success':
df.iloc[i, df.columns.get_loc('session_id')] = df.iloc[i-1, df.columns.get_loc('session_id')]
df.iloc[i, df.columns.get_loc('event_duration')] = df.iloc[i, df.columns.get_loc('event_time')]\
- df.iloc[i-1, df.columns.get_loc('event_time')]
if df.iloc[i, df.columns.get_loc('event_type')] == 'checkout_success':
df.iloc[i, df.columns.get_loc('ordered')] = 1
if df.iloc[i, df.columns.get_loc('event_type')] == 'search_event':
df.iloc[i, df.columns.get_loc('searched')] = 1
session_df = df.groupby(['user_id', 'session_id', 'event_time', 'region', 'platform',
'initial_referring_domain', 'event_type', 'funnel_step']).agg({'event_duration':'sum'})
searched_ordered_df = df.groupby(['session_id']).agg({'searched':'sum', 'ordered':'sum'})
session_df = session_df.join(searched_ordered_df, on='session_id')
#print(session_df.head(20))
from sqlalchemy import create_engine
engine_str = ("mysql://%s:%s@%s/%s" % (USR, PASS, HST, DB))
engine = create_engine(engine_str)
session_df.to_sql('session_funnel', con=engine, if_exists='append')