show databases;
┌────────────────┐
│ database_name │
│ varchar │
├────────────────┤
│ data_warehouse │
└────────────────┘
select * from information_schema.schemata;
┌────────────────┬────────────────────┬──────────────┬───┬──────────────────────┬──────────────────────┬──────────┐
│ catalog_name │ schema_name │ schema_owner │ … │ default_character_… │ default_character_… │ sql_path │
│ varchar │ varchar │ varchar │ │ varchar │ varchar │ varchar │
├────────────────┼────────────────────┼──────────────┼───┼──────────────────────┼──────────────────────┼──────────┤
│ data_warehouse │ main │ duckdb │ … │ NULL │ NULL │ NULL │
│ data_warehouse │ raw │ duckdb │ … │ NULL │ NULL │ NULL │
│ system │ information_schema │ duckdb │ … │ NULL │ NULL │ NULL │
│ system │ main │ duckdb │ … │ NULL │ NULL │ NULL │
│ system │ pg_catalog │ duckdb │ … │ NULL │ NULL │ NULL │
│ temp │ main │ duckdb │ … │ NULL │ NULL │ NULL │
├────────────────┴────────────────────┴──────────────┴───┴──────────────────────┴──────────────────────┴──────────┤
│ 6 rows 7 columns (6 shown) │
└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
use raw;
select 'organizations' as table_name, count(*) as cnt from organizations
union all
select 'members' as table_name, count(*) as cnt from members
union all
select 'events' as table_name, count(*) as cnt from events;
┌───────────────┬─────────┐
│ table_name │ cnt │
│ varchar │ int64 │
├───────────────┼─────────┤
│ organizations │ 17431 │
│ members │ 2065953 │
│ events │ 36956 │
└───────────────┴─────────┘
select * from organizations limit 5;
┌───────┬──────────────────────┬───────────────────────────────────────────────┬────────────┬────────────┬──────────────────────────┐
│ id │ name │ programs │ churned_at │ created_at │ loaded_at │
│ int32 │ varchar │ varchar[] │ date │ date │ timestamp with time zone │
├───────┼──────────────────────┼───────────────────────────────────────────────┼────────────┼────────────┼──────────────────────────┤
│ 1 │ Green Group │ [employee_assistance, primary_care] │ NULL │ 2023-05-10 │ 2023-05-11 01:20:00-04 │
│ 2 │ Hobbs-Ramirez │ [employee_assistance] │ NULL │ 2023-05-10 │ 2023-05-11 01:20:00-04 │
│ 3 │ Chan, Gill and Neal │ [primary_care, mental_health, employee_assi… │ NULL │ 2023-05-10 │ 2023-05-11 01:20:00-04 │
│ 4 │ Young, Martinez an… │ [mental_health] │ NULL │ 2023-05-10 │ 2023-05-11 01:20:00-04 │
│ 5 │ Krause, Fisher and… │ [mental_health, primary_care, employee_assi… │ NULL │ 2023-05-10 │ 2023-05-11 01:20:00-04 │
└───────┴──────────────────────┴───────────────────────────────────────────────┴────────────┴────────────┴──────────────────────────┘
select * from organizations where churned_at is not null limit 5;
┌───────┬─────────┬───────────┬────────────┬────────────┬──────────────────────────┐
│ id │ name │ programs │ churned_at │ created_at │ loaded_at │
│ int32 │ varchar │ varchar[] │ date │ date │ timestamp with time zone │
├───────┴─────────┴───────────┴────────────┴────────────┴──────────────────────────┤
│ 0 rows │
└──────────────────────────────────────────────────────────────────────────────────┘
select * from members limit 5;
┌───────┬─────────────────────┬──────────────────────┬───┬──────────────────────┬─────────────────┬──────────────────────┐
│ id │ name │ email │ … │ loaded_at │ organization_id │ eligible_programs │
│ int32 │ varchar │ varchar │ │ timestamp with tim… │ int32 │ varchar[] │
├───────┼─────────────────────┼──────────────────────┼───┼──────────────────────┼─────────────────┼──────────────────────┤
│ 6 │ William Ho │ fhodge@example.net │ … │ 2023-05-11 01:20:0… │ 12 │ [employee_assistan… │
│ 53 │ Leslie Barajas │ ucortez@example.org │ … │ 2023-05-11 01:20:0… │ 16 │ [employee_assistan… │
│ 100 │ Chase Johnson │ lewisgregory@examp… │ … │ 2023-05-11 01:20:0… │ 1 │ [employee_assistan… │
│ 107 │ Christopher Collins │ ashlee59@example.org │ … │ 2023-05-11 01:20:0… │ 6 │ [mental_health] │
│ 125 │ James Hartman │ rmorris@example.org │ … │ 2023-05-11 01:20:0… │ 18 │ [primary_care, men… │
├───────┴─────────────────────┴──────────────────────┴───┴──────────────────────┴─────────────────┴──────────────────────┤
│ 5 rows 9 columns (6 shown) │
└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
select * from events limit 5;
┌──────────────────────┬──────────────────────┬────────────────────┬────────────────────────────────────────────────────────────────┐
│ timestamp │ event_id │ event_type │ data │
│ timestamp │ varchar │ varchar │ json │
├──────────────────────┼──────────────────────┼────────────────────┼────────────────────────────────────────────────────────────────┤
│ 2023-05-10 00:00:0… │ d8699dc9-79df-49fa… │ episode_properties │ {"episode_id":1,"subject_member_id":136,"issue_type":null,"t… │
│ 2023-05-10 00:38:0… │ a138ad70-3c1f-45b4… │ episode_properties │ {"episode_id":1,"subject_member_id":136,"issue_type":null,"t… │
│ 2023-05-10 00:00:0… │ d0bdbb91-e750-4305… │ episode_properties │ {"episode_id":2,"subject_member_id":285,"issue_type":null,"t… │
│ 2023-05-10 00:44:0… │ 7e53f500-7f0e-4354… │ episode_properties │ {"episode_id":2,"subject_member_id":285,"issue_type":null,"t… │
│ 2023-05-10 01:00:0… │ c9608429-6dc1-4de2… │ episode_properties │ {"episode_id":2,"subject_member_id":285,"issue_type":"substa… │
└──────────────────────┴──────────────────────┴────────────────────┴────────────────────────────────────────────────────────────────┘
stg_organizations.sql
Added an extra column is_active
, derived from the churned_at
field:
is_active = true
when churned_at
is null
.is_active = false
otherwise.Ensured organization_id
is not null to maintain data quality.
with raw_organizations as (
select
id as organization_id,
name as organization_name,
created_at,
churned_at,
programs,
loaded_at
from {{ source('raw', 'organizations') }}
),
cleaned_organizations as (
select
organization_id,
organization_name,
created_at,
churned_at,
programs,
loaded_at,
case
when churned_at is null then true
else false
end as is_active
from raw_organizations
where organization_id is not null
)
select *
from cleaned_organizations
stg_members.sql
Applied a condition to exclude invalid records by filtering out rows where member_id
is null.
Kept all relevant attributes (personal details, organization, eligibility, timestamps) intact for downstream use.
with raw_members as (
select
id as member_id,
name,
email,
date_of_birth,
province_of_residence,
organization_id,
eligible_programs,
created_at,
loaded_at
from {{ source('raw', 'members') }}
where member_id is not null
)
select *
from raw_members
stg_events.sql