Engagement Index Scoring

Published

April 20, 2026

Adapted from: Taki et al. (2017). JMIR Mhealth Uhealth, 5(6), e89. doi: 10.2196/mhealth.7236

Protocol: Thornton et al. (2024). BMC Public Health, 24, 2697.

The purpose of this page is to prepare the data for a multiphase optimisation strategy (MOST) full factorial ANCOVA, which will estimate the main effect of each strategy on engagement while controlling for the others.

The primary outcome is the Engagement Index (EI), measured over the 3-month intervention period.

EI is the average of five subindices, each scaled from 0 to 1 where higher values indicate greater engagement:

  1. Ci: click depth
  2. Li: loyalty / login frequency
  3. Vi: volume of contributions
  4. Ri: recency / spacing between sessions
  5. Fi: feedback / uMARS subjective quality score

Four factorial components will be included as predictors:

Show/hide the code
# --- packages ------------------------------------------------------------
library(here)
library(dplyr) # data manipulation
library(tidyr) # reshaping
library(gtsummary) # summaries
library(flextable)
library(officer)
library(huxtable)
library(purrr)
library(lubridate)
# anchor all here() paths to the project root via this document
i_am("engagement.qmd")

# shared helper functions for the project
source(here("R/utils.R"))

# load long-format survey data (used below for wave == 3 uMARS items)
# NOTE(review): readRDS() expects a file path, but `df_long` is not defined
# earlier in this document. This only works if R/utils.R defines `df_long`
# as a path string; otherwise this line errors. Confirm, and consider an
# explicit here("...") path for clarity.
df_long <- readRDS(df_long)

# load app-usage objects; later code assumes this provides `participants`,
# `logins`, `modules`, `goals`, and `diaries` — confirm against the .RData
load(here("R/data/versions/engagement_data_script4.RData"))

Build engagement index dataset

Start with master participant dataset participants (built from /R/scripts/AH data cleaning/4_TripleE_participants.R) and join uMARS data from follow-up survey (3mth wave) to create a working dataset for engagement index scoring.

Show/hide the code
# uMARS items from the 3-month follow-up wave, one row per dyad
umars_df <- df_long |>
  filter(wave == 3) |>
  select(dyad_id, starts_with("h4_umars_"))

# join uMARS onto the participant master data (excluding one app_id) and
# add a placeholder age column; age is computed properly in a later step
engagement_index <- participants |>
  filter(app_id != 126449) |>
  left_join(umars_df, by = "dyad_id") |>
  mutate(age_baseline = 0)

# keep only the variables needed for engagement index scoring
engagement_index <- engagement_index |>
  select(
    name_clean,
    dyad_id,
    app_id,
    time_baseline,
    time_fu,
    gender,
    birthmonth,
    birth_year,
    age_baseline,
    group,
    starts_with("strategy"),
    first_login,
    last_login,
    login_count_summary,
    starts_with("h4_umars")
  )

Recode strategies as effect-coded factors. Raw coding is 1 = received, NA = not received (structural zero). Recode to -1 = not received, 1 = received. Effect coding is needed for the MOST ANCOVA to estimate the main effect of each strategy factor on engagement, controlling for the others.

Show/hide the code
# Effect-code every strategy_* column: 1 stays 1, anything else
# (NA structural zeros included) becomes -1. Columns may arrive as
# factors, hence the as.character() round-trip before comparing.
engagement_index <- engagement_index |>
  mutate(
    across(
      starts_with("strategy_"),
      \(s) if_else(!is.na(s) & as.integer(as.character(s)) == 1L, 1L, -1L)
    )
  )

Recode gender as a labelled factor and estimate age at baseline based on birth year and month. Age is calculated as the difference between baseline date and birth date, in years.

Show/hide the code
# Label gender and compute approximate age at baseline.
engagement_index <- engagement_index |>
  mutate(
    # levels 4 and 88 share a label, so factor() collapses them into a
    # single "Other/Prefer not to say" category
    gender = factor(
      gender,
      levels = c(3, 1, 2, 4, 88),
      labels = c(
        "Woman or female", "Man or male", "Non-binary",
        "Other/Prefer not to say", "Other/Prefer not to say"
      )
    ),
    # only birth year/month are collected, so anchor the birth date at the
    # 15th (mid-month) and convert the day difference to years
    age_baseline = as.numeric(
      difftime(
        time_baseline,
        make_date(birth_year, birthmonth, 15L),
        units = "days"
      )
    ) / 365.25
  )

Joining engagement data

Select variables from each data set and filter to be in participant-specific time window (baseline to follow-up). Datasets:

  • logins: app_id, login_datetime
  • modules: app_id, module, started_date, completed_date
  • goals: app_id, goal_title, goal_date
  • diaries: app_id, Behaviour, Behaviour.type, Value, diary_date
Show/hide the code
# Restrict logins to each participant's baseline–follow-up window.
# NOTE(review): the window check converts login_datetime with as.Date()
# (no tz) while session_date uses tz = "Australia/Sydney"; for logins near
# midnight these can disagree (as.Date.POSIXct defaults to UTC on R < 4.3).
# Preserved as-is here — confirm which calendar the window should use.
logins <- logins |>
  select(app_id, login_datetime) |>
  inner_join(
    engagement_index |>
      select(name_clean, app_id, time_baseline, time_fu),
    by = "app_id"
  ) |>
  mutate(
    valid = as.Date(login_datetime) >= as.Date(time_baseline) &
      as.Date(login_datetime) <= as.Date(time_fu),
    # session day in local (Sydney) time; sessions are calendar days
    session_date = as.Date(login_datetime, tz = "Australia/Sydney")
  )

# keep in-window rows (and rows whose dates are missing), drop helpers
logins <- logins |>
  filter(valid | is.na(valid)) |>
  select(-c(name_clean, time_baseline, time_fu, valid))
  
# modules: restrict to each participant's baseline–follow-up window
modules <- modules |>
  select(app_id, module, started_date, completed_date) |>
  inner_join(
    engagement_index |>
      select(name_clean, app_id, time_baseline, time_fu),
    by = "app_id"
  ) |>
  mutate(
    valid = as.Date(started_date) >= as.Date(time_baseline) &
      as.Date(started_date) <= as.Date(time_fu),
    session_date = as.Date(started_date, tz = "Australia/Sydney")
  )

# keep in-window rows (and rows whose dates are missing), drop helpers
modules <- modules |>
  filter(valid | is.na(valid)) |>
  select(-c(name_clean, time_baseline, time_fu, valid))

# goals: restrict to each participant's baseline–follow-up window
goals <- goals |>
  select(app_id, goal_title = Goal.title, goal_date) |>
  inner_join(
    engagement_index |>
      select(name_clean, app_id, time_baseline, time_fu),
    by = "app_id"
  ) |>
  mutate(
    valid = as.Date(goal_date) >= as.Date(time_baseline) &
      as.Date(goal_date) <= as.Date(time_fu),
    session_date = as.Date(goal_date, tz = "Australia/Sydney")
  )

# keep in-window rows (and rows whose dates are missing), drop helpers
goals <- goals |>
  filter(valid | is.na(valid)) |>
  select(-c(name_clean, time_baseline, time_fu, valid))
  
# diaries: restrict to each participant's baseline–follow-up window
diaries <- diaries |>
  select(
    app_id, behaviour = Behaviour, behaviour_type = Behaviour.type,
    diary_value = Value, diary_date
  ) |>
  inner_join(
    engagement_index |>
      select(name_clean, app_id, time_baseline, time_fu),
    by = "app_id"
  ) |>
  mutate(
    valid = as.Date(diary_date) >= as.Date(time_baseline) &
      as.Date(diary_date) <= as.Date(time_fu),
    session_date = as.Date(diary_date, tz = "Australia/Sydney")
  )

# keep in-window rows (and rows whose dates are missing), drop helpers
diaries <- diaries |>
  filter(valid | is.na(valid)) |>
  select(-c(name_clean, time_baseline, time_fu, valid))

Session aggregation

Create session level data for indices. A session is defined as a distinct calendar day in which the participant had at least one login. For each participant, login days rather than login events are used as activity data is linked to dates, not specific logins.

pages_viewed per session day = number of distinct app sections accessed: - each distinct module counts as one section - diary use on that day counts as one section (binary, not entry count) - goal use on that day counts as one section (binary)

A deep session is one where pages_viewed >= 2 (i.e., the participant used more than one feature in a single day).

Get distinct session days, module days, goal days, diary days for each participant. These are used to calculate click depth (Ci) and loyalty (Li) subindices.

Show/hide the code
# one row per participant per calendar day with at least one login
session_days <- logins |>
  distinct(app_id, session_date)

# number of distinct modules accessed on each day
module_days <- modules |>
  group_by(app_id, session_date) |>
  summarise(n_module_sections_day = n_distinct(module), .groups = "drop")

# binary flags: any diary / any goal activity on the day
diary_days <- diaries |>
  distinct(app_id, session_date) |>
  mutate(used_diary_day = 1L)
goal_days <- goals |>
  distinct(app_id, session_date) |>
  mutate(used_goals_day = 1L)

# assemble per-day section counts; a login day with no activity of a given
# type gets 0, and pages_viewed is the total distinct sections that day
session_pages <- session_days |>
  left_join(module_days, by = c("app_id", "session_date")) |>
  left_join(diary_days, by = c("app_id", "session_date")) |>
  left_join(goal_days, by = c("app_id", "session_date")) |>
  mutate(
    n_module_sections_day = replace(
      n_module_sections_day, is.na(n_module_sections_day), 0L
    ),
    used_diary_day = replace(used_diary_day, is.na(used_diary_day), 0L),
    used_goals_day = replace(used_goals_day, is.na(used_goals_day), 0L),
    pages_viewed = n_module_sections_day + used_diary_day + used_goals_day
  )

Participant aggregation

Session level data (logins) and day-level data (modules, goals, diaries) are aggregated to participant level and joined to create a working dataset for engagement index scoring.

Get: - session counts: total sessions (login days), n_sessions_deep (>= 2 sections) - login day stats: days accessed, mean gap between sessions - diary entry counts within time window - goal counts within time window

Show/hide the code
# participant-level session counts from the per-day usage table
session_agg <- session_pages |>
  group_by(app_id) |>
  summarise(
    # each row of session_pages is one distinct login day
    n_sessions_total = n(),
    # intentionally identical to n_sessions_total; downstream code
    # (Li, engagement_final) refers to it by this name
    n_sessions_in_window = n(),
    # "deep" session: >= 2 distinct sections used that day
    n_sessions_deep = sum(pages_viewed >= 2L),
    .groups = "drop"
  )

# participant-level login-day statistics
login_agg <- logins |>
  group_by(app_id) |>
  summarise(
    n_days_app_accessed = n_distinct(session_date),
    mean_days_btw_sessions = {
      day_seq <- sort(unique(session_date))
      # a gap needs >= 2 distinct login days; single-day users get NA
      if (length(day_seq) >= 2L) {
        mean(as.numeric(diff(day_seq)))
      } else {
        NA_real_
      }
    },
    .groups = "drop"
  )

# diary entry count per participant (already window-filtered above)
diary_agg <- diaries |>
  count(app_id, name = "n_diary_entries")

# goal count per participant (already window-filtered above)
goal_agg <- goals |>
  count(app_id, name = "n_goals")

Calculate sub indices

Five subindices (each scored 0–1; higher = better engagement). Join all aggregates onto the participant list, then compute each subindex. Participants with no app use (no logins in the window) will receive 0 for all behavioural subindices.

Show/hide the code
# start from the participant list (+ uMARS quality score) and fold each
# participant-level aggregate on by app_id
ei_scores <- Reduce(
  \(acc, tbl) left_join(acc, tbl, by = "app_id"),
  list(session_agg, login_agg, diary_agg, goal_agg),
  engagement_index |>
    select(dyad_id, app_id, h4_umars_overall_mean_app_subjective_quality_score)
)

Replace NA values with 0 for people who never used the app in the time between baseline / follow up

Show/hide the code
# participants with no app activity in the window have NA aggregates;
# treat those as zero engagement on the behavioural counts
ei_scores <- ei_scores |>
  mutate(
    across(
      c(n_sessions_total, n_sessions_in_window, n_sessions_deep,
        n_days_app_accessed, n_diary_entries, n_goals),
      \(v) replace(v, is.na(v), 0L)
    )
  )

Ci — click depth

Based on Taki et al.: the proportion of session days on which the participant accessed >= 2 distinct app sections.

formula: n_sessions_deep / n_sessions_total

ImportantQuestion

do we want to base it on taki’s scoring, or is it meant to be binary yes/no ever visited over total possible pages?

Show/hide the code
# Ci: share of login days that were "deep" (>= 2 sections used);
# non-users (zero sessions) score 0 rather than 0/0
ei_scores <- ei_scores |>
  mutate(
    Ci = if_else(n_sessions_total > 0, n_sessions_deep / n_sessions_total, 0)
  )

hist(ei_scores$Ci, col = "#edcdff", main = "Histogram of Click Depth Index (Ci)", xlab = "Click Depth (Ci)")
Figure 1

Loyalty index (Li)

  • more sessions = closer to 1

Li = 1-(1/n_logins)

Show/hide the code
# Li: 1 - 1/n sessions, so more sessions -> closer to 1;
# non-users score 0
ei_scores <- ei_scores |>
  mutate(
    Li = if_else(n_sessions_in_window > 0, 1 - 1 / n_sessions_in_window, 0)
  )
hist(ei_scores$Li, col = "#edcdff", main = "Histogram of Loyalty Index (Li)", xlab = "Loyalty (Li)")
Figure 2

Volume of contributions index (Vi)

  • replaces Taki’s interaction index (we don’t have push notifications)
  • more contributions = closer to 1, less contributions = closer to 0

Vi = total diary entries / max possible entries

where max possible entries is the number of days the participant accessed the app x 13 health behaviours

13 possible diary entries per day

Show/hide the code
# Vi: diary entries relative to the maximum possible (13 behaviours per
# day accessed), capped at 1; non-users score 0
ei_scores <- ei_scores |>
  mutate(
    Vi = if_else(
      n_days_app_accessed > 0,
      pmin(n_diary_entries / (n_days_app_accessed * 13L), 1),
      0
    )
  )
hist(ei_scores$Vi, col = "#edcdff", main = "Histogram of Volume of Contributions (Vi)", xlab = "Volume of Contributions (Vi)")
Figure 3

Recency index (Ri)

  • smaller gaps = larger Ri

inverse of the mean number of days between sessions, so users who logged in more regularly get higher Ri

Ri = 1 / mean_days_btw_sessions

sessions = calendar days so the minimum possible gap is 1 day… so 1/mean_days is already bounded between 0 and 1, but single session users will have no gap, so we assign them Ri = 1 (accessed app, but no gap to measure… max recency.) non users will get 0.

Show/hide the code
# Ri: inverse of the mean gap between session days (smaller gaps -> higher).
# Single-day users have no measurable gap and get 1; the gap == 0 guard is
# defensive (gaps between distinct calendar days are >= 1).
ei_scores <- ei_scores |>
  mutate(
    Ri_raw = case_when(
      n_sessions_total == 0 ~ NA_real_,
      n_sessions_total == 1 ~ 1,
      mean_days_btw_sessions == 0 ~ 1,
      TRUE ~ 1 / mean_days_btw_sessions
    ),
    # Ri_raw is NA exactly when there was no app use; score those 0
    Ri = coalesce(Ri_raw, 0)
  )

hist(ei_scores$Ri, col = "#edcdff", main = "Histogram of Recency (Ri)", xlab = "Recency (Ri)")
Figure 4

Feedback index (Fi)

  • uMARS subjective quality subscale (4 items, each 1-5) at 3mth follow up
  • rescaled to 0, 1

NA if the participant didn't complete the follow-up survey.

Fi = (mean_umars_score - 1) / (5 - 1)

Show/hide the code
# Fi: uMARS subjective quality (1–5 scale) rescaled to [0, 1];
# stays NA when the follow-up survey is missing
ei_scores <- ei_scores |>
  mutate(Fi = (h4_umars_overall_mean_app_subjective_quality_score - 1) / 4)

hist(ei_scores$Fi, col = "#edcdff", main = "Histogram of Subjective Feedback (Fi)", xlab = "Feedback (Fi)")
Figure 5

Composite Engagement Index

  • EI = mean(Ci, Li, Vi, Ri, Fi), each on [0, 1].

Use na.rm = TRUE so that a missing Fi doesn't zero out the whole score.

calculate n_subindices_available to see how many sub-indices contributed

Show/hide the code
# one column per subindex; Fi may be NA for survey non-completers
subindex_mat <- cbind(
  ei_scores$Ci, ei_scores$Li, ei_scores$Vi, ei_scores$Ri, ei_scores$Fi
)

ei_scores <- ei_scores |>
  mutate(
    # how many subindices contributed to each participant's EI
    n_subindices_available = rowSums(!is.na(subindex_mat)),
    # EI = mean of available subindices (na.rm skips a missing Fi)
    EI = rowMeans(subindex_mat, na.rm = TRUE),
    EI_pct = 100 * EI
  )
hist(ei_scores$EI, col = "#edcdff", main = "Histogram of Engagement", xlab = "Engagement Index (EI)")

Engagement classification

Quartile-based cut-points following Taki et al.:

  • Low : EI_pct <= Q1
  • Moderate : Q1 < EI_pct <= Q3
  • High : EI_pct > Q3
Show/hide the code
# quartile cut-points of the EI percentage score
q1 <- quantile(ei_scores$EI_pct, 0.25, na.rm = TRUE)
q3 <- quantile(ei_scores$EI_pct, 0.75, na.rm = TRUE)

# Classify engagement following Taki et al.:
#   Low      : EI_pct <= Q1
#   Moderate : Q1 < EI_pct <= Q3
#   High     : EI_pct > Q3
# cut() with right-closed breaks reproduces these intervals exactly and,
# unlike the previous case_when() (whose TRUE fallback labelled missing
# EI_pct as "High"), keeps NA scores as NA.
ei_scores <- ei_scores |>
  mutate(
    engagement_level = cut(
      EI_pct,
      breaks = c(-Inf, q1, q3, Inf),
      labels = c("Low", "Moderate", "High")
    )
  )

Build data for main analyses

Show/hide the code
# subset of engagement metrics to carry into the analysis dataset,
# renaming engagement_level to engagement_quartile on the way
ei_results <- ei_scores |>
  select(
    dyad_id,
    n_sessions_total,
    n_sessions_in_window,
    n_sessions_deep,
    n_days_app_accessed,
    mean_days_btw_sessions,
    n_diary_entries,
    n_goals,
    Ci, Li, Vi,
    Ri_raw, Ri,
    Fi,
    n_subindices_available,
    EI,
    EI_pct,
    engagement_quartile = engagement_level
  )

# attach scores to the participant-level working dataset and persist
engagement_final <- engagement_index |>
  left_join(ei_results, by = "dyad_id")

saveRDS(engagement_final, here("R/data/finals/engagement_final.rds"))

Summary

── EI distribution ────────────────────────────────
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.1250  0.3848  0.5000  0.4852  0.5937  0.7500 
── strategy coding (should be 1 or -1 only) ───────
gamification:

  -1    1 <NA> 
 131  111    0 
healthcoach:

  -1    1 <NA> 
 108  134    0 
presources:

  -1    1 <NA> 
 134  108    0 
sms:

  -1    1 <NA> 
 133  109    0 
── sub-index means ─────────────────────────────────
# A tibble: 1 × 6
     Ci    Li    Vi    Ri    Fi    EI
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.177 0.492 0.756 0.529 0.475 0.485