RelBench Rel-Stack with DuckDB (Post Votes as Label) (Regression)

Open full-size: SVG

This example follows the same structure as the user-badges workflow but for the post-votes task: Posts.csv is the parent node and the target is the number of positive votes (VoteTypeId = 2) a post receives in the 90 days after the cut date, which makes this a regression task.

This example dynamically downloads the hosted RelBench rel-stack dataset CSV files, instantiates all tables as DuckdbNode objects, and predicts future vote activity using Votes.csv as the label node.

Hosted dataset root: https://open-relbench.s3.us-east-1.amazonaws.com/rel-stack

Tables used:

Users.csv
Posts.csv
Badges.csv
PostHistory.csv
PostLinks.csv
Votes.csv
Comments.csv
Tags.csv

Complete Example

Data Preparation

Show Code

import datetime
from pathlib import Path
from urllib.request import urlretrieve

import duckdb

from graphreduce.graph_reduce import GraphReduce
from graphreduce.node import DuckdbNode
from graphreduce.enum import ComputeLayerEnum, PeriodUnit, SQLOpType
from graphreduce.models import sqlop

# Root URL under which the RelBench rel-stack CSV files are hosted.
BASE_URL = "https://open-relbench.s3.us-east-1.amazonaws.com/rel-stack"

# File names fetched from BASE_URL; each one backs a DuckdbNode below.
TABLES = [
    "Users.csv", "Posts.csv", "Badges.csv", "PostHistory.csv",
    "PostLinks.csv", "Votes.csv", "Comments.csv", "Tags.csv",
]

# Local cache directory for the downloaded CSVs (relative to the current
# working directory); created on first run.
data_dir = Path("data/relbench/rel-stack")
data_dir.mkdir(parents=True, exist_ok=True)

# Download data dynamically (skip files that already exist).
for table in TABLES:
    out_path = data_dir / table
    if out_path.exists():
        continue
    # Fetch into a sibling ".part" file and rename it only once the transfer
    # has finished. Writing straight to out_path would leave a truncated CSV
    # behind if the download is interrupted, and the existence check above
    # would then skip it on every later run.
    tmp_path = out_path.with_name(out_path.name + ".part")
    try:
        urlretrieve(f"{BASE_URL}/{table}", tmp_path)
        tmp_path.replace(out_path)
    finally:
        # After a successful rename the temp file no longer exists; this only
        # cleans up the partial file left by a failed or interrupted transfer.
        if tmp_path.exists():
            tmp_path.unlink()

# Cut-off timestamp shared by the post filter and the GraphReduce
# configuration further down.
cut_date = datetime.datetime(year=2020, month=1, day=1)

# DuckDB connection opened without a database path; it is handed to
# GraphReduce as the SQL client and reused to read back the result.
con = duckdb.connect()


# Parent node of the graph, backed by Posts.csv. Column names in the filter
# clauses carry the "post_" prefix configured on this node.
post = DuckdbNode(
    table_name="posts",
    fpath=f"'{data_dir / 'Posts.csv'}'",
    prefix="post",
    pk="Id",
    date_key="CreationDate",
    columns=[
        "Id",
        "OwnerUserId",
        "PostTypeId",
        "AcceptedAnswerId",
        "ParentId",
        "Title",
        "Tags",
        "Body",
        "CreationDate",
    ],
    # One `where` op per clause, applied in this order: posts created on or
    # before the cut date, PostTypeId 1 only (presumably questions -- confirm
    # against the dataset schema), and a real owner (not null, not -1).
    do_filters_ops=[
        sqlop(optype=SQLOpType.where, opval=clause)
        for clause in (
            f"post_CreationDate <= '{cut_date.date()}'",
            "post_PostTypeId = 1",
            "post_OwnerUserId is not null",
            "post_OwnerUserId != -1",
        )
    ],
)

# Votes node, backed by Votes.csv. It is used as the label node of the graph:
# the label ops count rows with VoteTypeId = 2 per PostId and expose the
# result as `vote_positive_votes_label`.
vote = DuckdbNode(
    table_name="votes",
    fpath=f"'{data_dir / 'Votes.csv'}'",
    prefix="vote",
    pk="Id",
    date_key="CreationDate",
    columns=["Id", "PostId", "VoteTypeId", "UserId", "CreationDate"],
    do_labels_ops=[
        sqlop(optype=op_type, opval=op_value)
        for op_type, op_value in (
            # Aggregate expression that produces the label column.
            (
                SQLOpType.aggfunc,
                "sum(case when vote_VoteTypeId = 2 then 1 else 0 end) as vote_positive_votes_label",
            ),
            # Grouping key for the aggregation.
            (SQLOpType.agg, "vote_PostId"),
        )
    ],
)

# Remaining tables, declared without custom ops. Each node reads one CSV from
# data_dir and sets its own column prefix.

# Comments.csv, keyed to posts through PostId.
comment = DuckdbNode(
    table_name="comments",
    fpath=f"'{data_dir / 'Comments.csv'}'",
    prefix="comm",
    pk="Id",
    date_key="CreationDate",
    columns=[
        "Id",
        "PostId",
        "Text",
        "CreationDate",
        "UserId",
        "ContentLicense",
    ],
)

# PostHistory.csv, keyed to posts through PostId.
post_history = DuckdbNode(
    table_name="post_history",
    fpath=f"'{data_dir / 'PostHistory.csv'}'",
    prefix="ph",
    pk="Id",
    date_key="CreationDate",
    columns=[
        "Id",
        "PostHistoryTypeId",
        "PostId",
        "RevisionGUID",
        "CreationDate",
        "UserId",
        "Text",
        "Comment",
        "ContentLicense",
    ],
)

# PostLinks.csv, keyed to posts through PostId.
post_links = DuckdbNode(
    table_name="post_links",
    fpath=f"'{data_dir / 'PostLinks.csv'}'",
    prefix="plink",
    pk="Id",
    date_key="CreationDate",
    columns=["Id", "CreationDate", "PostId", "RelatedPostId", "LinkTypeId"],
)

# Tags.csv; the only node declared without a date key.
tag = DuckdbNode(
    table_name="tags",
    fpath=f"'{data_dir / 'Tags.csv'}'",
    prefix="tag",
    pk="Id",
    date_key=None,
    columns=["Id", "TagName", "Count", "ExcerptPostId", "WikiPostId"],
)

# Users.csv, joined to posts through the post's OwnerUserId.
user = DuckdbNode(
    table_name="users",
    fpath=f"'{data_dir / 'Users.csv'}'",
    prefix="user",
    pk="Id",
    date_key="CreationDate",
    columns=[
        "Id",
        "DisplayName",
        "Location",
        "ProfileImageUrl",
        "WebsiteUrl",
        "AboutMe",
        "CreationDate",
    ],
)

# Badges.csv, keyed to users through UserId; its date column is named "Date".
badge = DuckdbNode(
    table_name="badges",
    fpath=f"'{data_dir / 'Badges.csv'}'",
    prefix="bad",
    pk="Id",
    date_key="Date",
    columns=["Id", "UserId", "Class", "Name", "Date"],
)

# Graph configuration: posts are the parent node, votes supply the label, and
# everything is computed through the DuckDB connection `con`.
gr = GraphReduce(
    name="rel-stack-post-votes",
    # Compute backend.
    compute_layer=ComputeLayerEnum.duckdb,
    sql_client=con,
    # Parent and label nodes.
    parent_node=post,
    label_node=vote,
    # Time windows around the cut date: 3650 days on the compute side and
    # 90 days on the label side (presumably look-back for features and
    # look-ahead for the label -- confirm against the GraphReduce docs).
    cut_date=cut_date,
    compute_period_val=3650,
    compute_period_unit=PeriodUnit.day,
    label_period_val=90,
    label_period_unit=PeriodUnit.day,
    date_filters_on_agg=True,
    # Automatic feature generation settings.
    auto_features=True,
    auto_feature_hops_back=4,
    auto_feature_hops_front=0,
)

# Register every node with the graph before wiring the edges.
for node in (post, vote, comment, post_history, post_links, tag, user, badge):
    gr.add_node(node)

# Post-centric rollups: each relation is reduced onto the post it references.
gr.add_entity_edge(
    parent_node=post,
    relation_node=vote,
    parent_key="Id",
    relation_key="PostId",
    reduce=True,
)
gr.add_entity_edge(
    parent_node=post,
    relation_node=comment,
    parent_key="Id",
    relation_key="PostId",
    reduce=True,
)
gr.add_entity_edge(
    parent_node=post,
    relation_node=post_history,
    parent_key="Id",
    relation_key="PostId",
    reduce=True,
)
gr.add_entity_edge(
    parent_node=post,
    relation_node=post_links,
    parent_key="Id",
    relation_key="PostId",
    reduce=True,
)
# NOTE(review): tags join on ExcerptPostId while `post` is filtered to
# PostTypeId = 1 -- confirm that this join can actually match rows.
gr.add_entity_edge(
    parent_node=post,
    relation_node=tag,
    parent_key="Id",
    relation_key="ExcerptPostId",
    reduce=True,
)

# Owner-user branch and badges propagated from user to post.
gr.add_entity_edge(
    parent_node=post,
    relation_node=user,
    parent_key="OwnerUserId",
    relation_key="Id",
    reduce=True,
)
gr.add_entity_edge(
    parent_node=user,
    relation_node=badge,
    parent_key="Id",
    relation_key="UserId",
    reduce=True,
)

# Execute the graph's SQL transformations.
gr.do_transformations_sql()

# Read the parent node's current data reference back into a DataFrame.
result_query = f"select * from {gr.parent_node._cur_data_ref}"
out_df = con.sql(result_query).to_df()
print("rows:", out_df.shape[0])
print("columns:", out_df.shape[1])

# Work on a copy so `out_df` keeps the untouched query result.
df = out_df.copy()

# Label columns are generated from the votes label node.
label_cols = [
    name
    for name in df.columns
    if name.startswith("vote_") and "label" in name.lower()
]
print("label columns:", label_cols)
print(df.head())

Model Training