VirtualDB Tutorial: SQL-First Cross-Dataset Queries¶

The VirtualDB class provides a SQL query interface across heterogeneous Huggingface datasets stored in Parquet format. See the VirtualDB configuration documentation for more details on how to set up your datasets for use with VirtualDB.

Configuration¶

The configuration for VirtualDB is defined in a YAML file that specifies the datasets to include, their locations, and any relevant metadata or mappings. Below is an example configuration:

In [1]:

Copied!





config_yaml = """
repositories:
  BrentLab/harbison_2004:
    tags:
      assay: binding
      method: chip-chip
      organism: yeast
    dataset:
      harbison_2004:
        db_name: harbison
        sample_id:
          field: sample_id
        carbon_source:
          field: condition
          path: media.carbon_source.compound
        temperature_celsius:
          field: condition
          path: temperature_celsius
          dtype: numeric
        regulator_locus_tag:
          field: regulator_locus_tag
        regulator_symbol:
          field: regulator_symbol

  BrentLab/kemmeren_2014:
    tags:
      assay: perturbation
      method: microarray
      organism: yeast
    dataset:
      kemmeren_2014:
        db_name: kemmeren
        sample_id:
          field: sample_id
        carbon_source:
          path: experimental_conditions.media.carbon_source.compound
        temperature_celsius:
          path: experimental_conditions.temperature_celsius
          dtype: numeric
        regulator_locus_tag:
          field: regulator_locus_tag
        regulator_symbol:
          field: regulator_symbol

  BrentLab/hackett_2020:
    # Repo-level tags apply to all datasets in this repository
    tags:
      method: test_overwrite
      organism: yeast
    dataset:
      hackett_2020:
        # Dataset-level tags: 'assay' is new,
        # 'method' overrides the repo-level value
        tags:
          assay: perturbation
          method: overexpression
        db_name: hackett
        sample_id:
          field: sample_id
        carbon_source:
          path: experimental_conditions.media.carbon_source.compound
        temperature_celsius:
          path: experimental_conditions.temperature_celsius
          dtype: numeric
        regulator_locus_tag:
          field: regulator_locus_tag
        regulator_symbol:
          field: regulator_symbol

  BrentLab/yeast_comparative_analysis:
    dataset:
      dto:
        dto_pvalue:
          field: dto_empirical_pvalue
        dto_fdr:
          field: dto_fdr
        links:
          binding_id:
            - [BrentLab/harbison_2004, harbison_2004]
          perturbation_id:
            - [BrentLab/kemmeren_2014, kemmeren_2014]
            - [BrentLab/hackett_2020, hackett_2020]

factor_aliases:
  carbon_source:
    glucose: [D-glucose, dextrose, glu]
    galactose: [D-galactose, gal]
    raffinose: [D-raffinose]

missing_value_labels:
  carbon_source: unspecified

description:
  carbon_source: The carbon source provided during growth
  temperature_celsius: Growth temperature in degrees Celsius
"""

import tempfile
from pathlib import Path

temp_config = Path(tempfile.mkdtemp()) / "vdb_config.yaml"
temp_config.write_text(config_yaml)
print(f"Config saved to: {temp_config}")
config_yaml = """
repositories:
  BrentLab/harbison_2004:
    tags:
      assay: binding
      method: chip-chip
      organism: yeast
    dataset:
      harbison_2004:
        db_name: harbison
        sample_id:
          field: sample_id
        carbon_source:
          field: condition
          path: media.carbon_source.compound
        temperature_celsius:
          field: condition
          path: temperature_celsius
          dtype: numeric
        regulator_locus_tag:
          field: regulator_locus_tag
        regulator_symbol:
          field: regulator_symbol

  BrentLab/kemmeren_2014:
    tags:
      assay: perturbation
      method: microarray
      organism: yeast
    dataset:
      kemmeren_2014:
        db_name: kemmeren
        sample_id:
          field: sample_id
        carbon_source:
          path: experimental_conditions.media.carbon_source.compound
        temperature_celsius:
          path: experimental_conditions.temperature_celsius
          dtype: numeric
        regulator_locus_tag:
          field: regulator_locus_tag
        regulator_symbol:
          field: regulator_symbol

  BrentLab/hackett_2020:
    # Repo-level tags apply to all datasets in this repository
    tags:
      method: test_overwrite
      organism: yeast
    dataset:
      hackett_2020:
        # Dataset-level tags: 'assay' is new,
        # 'method' overrides the repo-level value
        tags:
          assay: perturbation
          method: overexpression
        db_name: hackett
        sample_id:
          field: sample_id
        carbon_source:
          path: experimental_conditions.media.carbon_source.compound
        temperature_celsius:
          path: experimental_conditions.temperature_celsius
          dtype: numeric
        regulator_locus_tag:
          field: regulator_locus_tag
        regulator_symbol:
          field: regulator_symbol

  BrentLab/yeast_comparative_analysis:
    dataset:
      dto:
        dto_pvalue:
          field: dto_empirical_pvalue
        dto_fdr:
          field: dto_fdr
        links:
          binding_id:
            - [BrentLab/harbison_2004, harbison_2004]
          perturbation_id:
            - [BrentLab/kemmeren_2014, kemmeren_2014]
            - [BrentLab/hackett_2020, hackett_2020]

factor_aliases:
  carbon_source:
    glucose: [D-glucose, dextrose, glu]
    galactose: [D-galactose, gal]
    raffinose: [D-raffinose]

missing_value_labels:
  carbon_source: unspecified

description:
  carbon_source: The carbon source provided during growth
  temperature_celsius: Growth temperature in degrees Celsius
"""

import tempfile
from pathlib import Path

temp_config = Path(tempfile.mkdtemp()) / "vdb_config.yaml"
temp_config.write_text(config_yaml)
print(f"Config saved to: {temp_config}")

Config saved to: /tmp/tmpjxc9wv_g/vdb_config.yaml

Initializing VirtualDB¶

Creating a VirtualDB instance loads and validates the config, downloads any necessary data, and registers all views immediately.

In [2]:

Copied!





from tfbpapi.virtual_db import VirtualDB

# Pass an HF token if the repos are private:
# vdb = VirtualDB(str(temp_config), token="hf_...")
vdb = VirtualDB(str(temp_config))
print(repr(vdb))
from tfbpapi.virtual_db import VirtualDB

# Pass an HF token if the repos are private:
# vdb = VirtualDB(str(temp_config), token="hf_...")
vdb = VirtualDB(str(temp_config))
print(repr(vdb))

/home/chase/code/tfbp/tfbpapi/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 11618.57it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 6061.13it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 7206.71it/s]
Fetching 36 files: 100%|██████████| 36/36 [00:00<00:00, 8037.20it/s]
Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])
Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])
Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])
Key 'temperature_celsius' not found at path 'temperature_celsius' (current keys: ['description', 'initial_temperature_celsius', 'temperature_shift_celsius', 'temperature_shift_duration_minutes', 'growth_phase_at_harvest', 'media'])

VirtualDB(4 repos, 4 datasets, 7 views)

Listing datasets¶

To list the datasets available in the VirtualDB instance, use get_datasets().

In [3]:

Copied!

print("\nDatasets:")
for dataset in vdb.get_datasets():
    print(f"- {dataset}")
print("\nDatasets:")
for dataset in vdb.get_datasets():
    print(f"- {dataset}")

Datasets:
- dto
- hackett
- harbison
- kemmeren

Tags¶

Tags are arbitrary key/value annotations defined in the configuration. They follow the same hierarchy as property mappings: repo-level tags apply to all datasets in that repository, and dataset-level tags override repo-level tags with the same key.

Use config.get_tags(repo_id, config_name) to retrieve the merged tags for any dataset.

In [4]:

Copied!





# Tags are accessible directly from the VirtualDB instance using the db_name.
# No need to import MetadataConfig or specify repo_id.
print("harbison tags:", vdb.get_tags("harbison"))
print("kemmeren tags:", vdb.get_tags("kemmeren"))

# Hackett has tags at both levels:
# 'organism' comes from the repo level only,
# 'assay' is added at the dataset level only,
# 'method' is defined at both levels -- the dataset value wins.
print("hackett tags:", vdb.get_tags("hackett"))

# Dataset with no tags returns empty dict
print("dto tags:", vdb.get_tags("dto"))

# Tags are accessible directly from the VirtualDB instance using the db_name.
# No need to import MetadataConfig or specify repo_id.
print("harbison tags:", vdb.get_tags("harbison"))
print("kemmeren tags:", vdb.get_tags("kemmeren"))

# Hackett has tags at both levels:
# 'organism' comes from the repo level only,
# 'assay' is added at the dataset level only,
# 'method' is defined at both levels -- the dataset value wins.
print("hackett tags:", vdb.get_tags("hackett"))

# Dataset with no tags returns empty dict
print("dto tags:", vdb.get_tags("dto"))

harbison tags: {'assay': 'binding', 'method': 'chip-chip', 'organism': 'yeast'}
kemmeren tags: {'assay': 'perturbation', 'method': 'microarray', 'organism': 'yeast'}
hackett tags: {'method': 'overexpression', 'organism': 'yeast', 'assay': 'perturbation'}
dto tags: {}

Schema Discovery¶

Use tables(), describe(), get_fields(), and get_common_fields() to explore the registered views before writing SQL.

Note that primary datasets get two views each:

<db_name> -- the full measurement-level data (one row per sample-target pair)
<db_name>_meta -- deduplicated sample-level metadata (one row per sample), including derived columns from config property mappings (e.g., carbon_source resolved from DataCard field definitions, with factor aliases applied).

In [5]:

Copied!





# List all registered views
print("Registered views:")
for name in vdb.tables():
    print(f"  {name}")
# List all registered views
print("Registered views:")
for name in vdb.tables():
    print(f"  {name}")

Registered views:
  dto_expanded
  hackett
  hackett_meta
  harbison
  harbison_meta
  kemmeren
  kemmeren_meta

In [6]:

Copied!

# The _meta view has sample-level metadata plus derived columns
# (carbon_source, temperature_celsius resolved from condition definitions)
vdb.describe("harbison_meta")
# The _meta view has sample-level metadata plus derived columns
# (carbon_source, temperature_celsius resolved from condition definitions)
vdb.describe("harbison_meta")

Out[6]:

	table	column_name	column_type	null	key	default	extra
0	harbison_meta	sample_id	INTEGER	YES	None	None	None
1	harbison_meta	condition	VARCHAR	YES	None	None	None
2	harbison_meta	regulator_locus_tag	VARCHAR	YES	None	None	None
3	harbison_meta	regulator_symbol	VARCHAR	YES	None	None	None
4	harbison_meta	carbon_source	VARCHAR	YES	None	None	None
5	harbison_meta	temperature_celsius	DOUBLE	YES	None	None	None

In [7]:

Copied!

# The full view has measurement-level data (one row per sample-target pair)
vdb.describe("harbison")
# The full view has measurement-level data (one row per sample-target pair)
vdb.describe("harbison")

Out[7]:

	table	column_name	column_type	null	key	default	extra
0	harbison	sample_id	INTEGER	YES	None	None	None
1	harbison	db_id	DOUBLE	YES	None	None	None
2	harbison	regulator_locus_tag	VARCHAR	YES	None	None	None
3	harbison	regulator_symbol	VARCHAR	YES	None	None	None
4	harbison	condition	VARCHAR	YES	None	None	None
5	harbison	target_locus_tag	VARCHAR	YES	None	None	None
6	harbison	target_symbol	VARCHAR	YES	None	None	None
7	harbison	effect	DOUBLE	YES	None	None	None
8	harbison	pvalue	DOUBLE	YES	None	None	None
9	harbison	carbon_source	VARCHAR	YES	None	None	None
10	harbison	temperature_celsius	DOUBLE	YES	None	None	None

In [8]:

Copied!

# Columns common to ALL primary dataset views
print("Common fields:", vdb.get_common_fields())
# Columns common to ALL primary dataset views
print("Common fields:", vdb.get_common_fields())

Common fields: ['carbon_source', 'regulator_locus_tag', 'regulator_symbol', 'sample_id', 'temperature_celsius']

Missing Value Labels¶

When a property key is listed under missing_value_labels, every dataset that does not have an explicit mapping for that property will still expose the column in its _meta view, filled with the configured fallback string.

In the config above, carbon_source: unspecified is set in missing_value_labels. All three datasets (harbison, kemmeren, hackett) happen to have an explicit carbon_source mapping, so they resolve real values.

To demonstrate the fallback, we build a minimal config that omits carbon_source from kemmeren. Without missing_value_labels, kemmeren would have no carbon_source column at all. With it, the column appears with the default value.

In [9]:

Copied!





minimal_yaml = """
repositories:
  BrentLab/harbison_2004:
    dataset:
      harbison_2004:
        db_name: harbison2
        sample_id:
          field: sample_id
        # harbison has carbon_source mapped via field+path
        carbon_source:
          field: condition
          path: media.carbon_source.compound

  BrentLab/kemmeren_2014:
    dataset:
      kemmeren_2014:
        db_name: kemmeren2
        sample_id:
          field: sample_id
        # kemmeren has NO carbon_source mapping -- fallback will apply

factor_aliases:
  carbon_source:
    glucose: [D-glucose, dextrose, glu]

missing_value_labels:
  carbon_source: unspecified
"""

import tempfile
from pathlib import Path
from tfbpapi.virtual_db import VirtualDB

p = Path(tempfile.mkdtemp()) / "minimal.yaml"
p.write_text(minimal_yaml)
vdb2 = VirtualDB(str(p))

# harbison resolves real values from DataCard definitions
print("harbison2 carbon_source values:")
print(vdb2.query("SELECT DISTINCT carbon_source FROM harbison2_meta"))

# kemmeren has no mapping -- gets the missing_value_labels fallback
print("\nkemmeren2 carbon_source values:")
print(vdb2.query("SELECT DISTINCT carbon_source FROM kemmeren2_meta"))

# Both views expose the column, enabling cross-dataset queries without COALESCE
print("\ncross-dataset query using carbon_source on both:")
print(vdb2.query("""
    SELECT 'harbison' AS dataset, carbon_source, COUNT(*) AS n
    FROM harbison2_meta GROUP BY carbon_source
    UNION ALL
    SELECT 'kemmeren' AS dataset, carbon_source, COUNT(*) AS n
    FROM kemmeren2_meta GROUP BY carbon_source
    ORDER BY dataset, carbon_source
"""))

p.unlink(missing_ok=True)
minimal_yaml = """
repositories:
  BrentLab/harbison_2004:
    dataset:
      harbison_2004:
        db_name: harbison2
        sample_id:
          field: sample_id
        # harbison has carbon_source mapped via field+path
        carbon_source:
          field: condition
          path: media.carbon_source.compound

  BrentLab/kemmeren_2014:
    dataset:
      kemmeren_2014:
        db_name: kemmeren2
        sample_id:
          field: sample_id
        # kemmeren has NO carbon_source mapping -- fallback will apply

factor_aliases:
  carbon_source:
    glucose: [D-glucose, dextrose, glu]

missing_value_labels:
  carbon_source: unspecified
"""

import tempfile
from pathlib import Path
from tfbpapi.virtual_db import VirtualDB

p = Path(tempfile.mkdtemp()) / "minimal.yaml"
p.write_text(minimal_yaml)
vdb2 = VirtualDB(str(p))

# harbison resolves real values from DataCard definitions
print("harbison2 carbon_source values:")
print(vdb2.query("SELECT DISTINCT carbon_source FROM harbison2_meta"))

# kemmeren has no mapping -- gets the missing_value_labels fallback
print("\nkemmeren2 carbon_source values:")
print(vdb2.query("SELECT DISTINCT carbon_source FROM kemmeren2_meta"))

# Both views expose the column, enabling cross-dataset queries without COALESCE
print("\ncross-dataset query using carbon_source on both:")
print(vdb2.query("""
    SELECT 'harbison' AS dataset, carbon_source, COUNT(*) AS n
    FROM harbison2_meta GROUP BY carbon_source
    UNION ALL
    SELECT 'kemmeren' AS dataset, carbon_source, COUNT(*) AS n
    FROM kemmeren2_meta GROUP BY carbon_source
    ORDER BY dataset, carbon_source
"""))

p.unlink(missing_ok=True)

Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 6316.72it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 6990.51it/s]
Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])
Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])
Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])

harbison2 carbon_source values:
  carbon_source
0   D-raffinose
1   D-galactose
2       glucose
3   unspecified

kemmeren2 carbon_source values:
  carbon_source
0   unspecified

cross-dataset query using carbon_source on both:
    dataset carbon_source     n
0  harbison   D-galactose     4
1  harbison   D-raffinose     1
2  harbison       glucose   310
3  harbison   unspecified    37
4  kemmeren   unspecified  1484

Querying VirtualDB¶

The .query() method executes SQL queries against the registered views. You can write complex SQL queries that join across multiple datasets, filter based on metadata, and aggregate results as needed.

You can also use parameterized queries to safely inject variables into your SQL statements, and prepared statements for repeated queries with different parameters. Named prepared statements can be passed to .prepare() and then executed with .query() with any parameterized values passed in as an arbitrary number of key/value arguments.

In [ ]:

Copied!





# Query the _meta view for sample-level metadata (one row per sample)
# Note: carbon_source is derived from the condition column's DataCard definitions
# with factor aliases already applied (D-glucose -> glucose)
df_meta = vdb.query("SELECT * FROM harbison_meta LIMIT 5")
df_meta
# Query the _meta view for sample-level metadata (one row per sample)
# Note: carbon_source is derived from the condition column's DataCard definitions
# with factor aliases already applied (D-glucose -> glucose)
df_meta = vdb.query("SELECT * FROM harbison_meta LIMIT 5")
df_meta

Out[ ]:

	sample_id	condition	regulator_locus_tag	regulator_symbol	carbon_source	temperature_celsius
0	118	H2O2Hi	YGL073W	HSF1	glucose	30.0
1	216	YPD	YKR064W	OAF3	glucose	30.0
2	314	SM	YOR358W	HAP5	unspecified	30.0
3	330	YPD	YPL177C	CUP9	glucose	30.0
4	9	RAPA	YBL103C	RTG3	glucose	30.0

5. Parameterized Queries¶

Pass keyword arguments to query() and reference them with DuckDB's $name syntax.

In [ ]:

Copied!





# A parameterized query has the following form, where `$reg` is a placeholder
# that gets replaced with the value provided in the `reg` argument.
vdb.query(
    "SELECT * FROM harbison WHERE regulator_symbol = $reg LIMIT 5",
    reg="REB1",
)

# A parameterized query can be saved for future use with the `.prepare()` method
# A parameterized query has the following form, where `$reg` is a placeholder
# that gets replaced with the value provided in the `reg` argument.
vdb.query(
    "SELECT * FROM harbison WHERE regulator_symbol = $reg LIMIT 5",
    reg="REB1",
)

# A parameterized query can be saved for future use with the `.prepare()` method

Out[ ]:

	sample_id	db_id	regulator_locus_tag	regulator_symbol	condition	target_locus_tag	target_symbol	effect	pvalue	carbon_source	temperature_celsius
0	13	12.0	YBR049C	REB1	H2O2Hi	YPR204W	YPR204W	0.901613	0.676943	glucose	30.0
1	13	12.0	YBR049C	REB1	H2O2Hi	YPR203W	YPR203W	1.053452	0.384404	glucose	30.0
2	13	12.0	YBR049C	REB1	H2O2Hi	YPR202W	YPR202W	1.053452	0.384404	glucose	30.0
3	13	12.0	YBR049C	REB1	H2O2Hi	YPR201W	ARR3	0.844298	0.665375	glucose	30.0
4	13	12.0	YBR049C	REB1	H2O2Hi	YPR200C	ARR2	0.844298	0.665375	glucose	30.0

Prepared Queries¶

Use prepare() to register a named, reusable query template. Then call it by name via query().

In [ ]:

Copied!





# Register a prepared query
vdb.prepare("glucose_regs", """
    SELECT regulator_symbol, COUNT(*) AS n
    FROM harbison_meta
    WHERE carbon_source = $cs
    GROUP BY regulator_symbol
    HAVING n >= $min_n
    ORDER BY n DESC
""")

# note that rather than a SQL statement, we pass in the name of the prepared query
# and provide the appropriate parameters
vdb.query("glucose_regs", cs="glucose", min_n=2)
# Register a prepared query
vdb.prepare("glucose_regs", """
    SELECT regulator_symbol, COUNT(*) AS n
    FROM harbison_meta
    WHERE carbon_source = $cs
    GROUP BY regulator_symbol
    HAVING n >= $min_n
    ORDER BY n DESC
""")

# note that rather than a SQL statement, we pass in the name of the prepared query
# and provide the appropriate parameters
vdb.query("glucose_regs", cs="glucose", min_n=2)

Out[ ]:

	regulator_symbol	n
0	MSN2	6
1	MSN4	5
2	HSF1	4
3	STE12	4
4	SKN7	4
...	...	...
58	PUT3	2
59	RTG1	2
60	ADR1	2
61	UGA3	2
62	PDR1	2

63 rows × 2 columns

7. Comparative Dataset Views¶

Comparative datasets (those with links) get an extra view type:

<name>_expanded: For each composite ID field, adds two parsed columns:

<link_field>_source -- the source dataset, aliased to db_name when the repo_id;config_name pair is in the VirtualDB config.
<link_field>_id -- the sample_id component.

This makes it easy to join or filter by source dataset without manually parsing composite IDs.

In [ ]:

Copied!

# The expanded view has parsed _source and _id columns for each link field
vdb.query("SELECT * FROM dto_expanded LIMIT 3")
# The expanded view has parsed _source and _id columns for each link field
vdb.query("SELECT * FROM dto_expanded LIMIT 3")

Out[ ]:

	binding_id	perturbation_id	binding_rank_threshold	perturbation_rank_threshold	binding_set_size	perturbation_set_size	dto_fdr	dto_empirical_pvalue	pr_ranking_column	binding_repo_dataset	perturbation_repo_dataset	binding_id_id	binding_id_source	perturbation_id_id	perturbation_id_source
0	BrentLab/harbison_2004;harbison_2004;105	BrentLab/hughes_2006;overexpression;10	11.0	206.0	12.0	206.0	0.041293	0.017	log2fc	harbison_2004-harbison_2004	hughes_2006-overexpression	105	harbison	10	BrentLab/hughes_2006;overexpression
1	BrentLab/harbison_2004;harbison_2004;108	BrentLab/hughes_2006;overexpression;11	60.0	67.0	60.0	67.0	0.054284	0.000	log2fc	harbison_2004-harbison_2004	hughes_2006-overexpression	108	harbison	11	BrentLab/hughes_2006;overexpression
2	BrentLab/harbison_2004;harbison_2004;109	BrentLab/hughes_2006;overexpression;11	27.0	1265.0	27.0	1265.0	0.123214	0.057	log2fc	harbison_2004-harbison_2004	hughes_2006-overexpression	109	harbison	11	BrentLab/hughes_2006;overexpression

In [ ]:

Copied!





# Join harbison metadata to dto via the expanded view's parsed columns
vdb.query("""
    SELECT h.*, d.dto_empirical_pvalue, d.dto_fdr
    FROM harbison_meta h
    JOIN dto_expanded d
        ON CAST(h.sample_id AS VARCHAR) = d.binding_id_id
        AND d.binding_id_source = 'harbison'
    WHERE d.dto_empirical_pvalue <= 0.01
    ORDER BY d.dto_empirical_pvalue
    LIMIT 10
""")
# Join harbison metadata to dto via the expanded view's parsed columns
vdb.query("""
    SELECT h.*, d.dto_empirical_pvalue, d.dto_fdr
    FROM harbison_meta h
    JOIN dto_expanded d
        ON CAST(h.sample_id AS VARCHAR) = d.binding_id_id
        AND d.binding_id_source = 'harbison'
    WHERE d.dto_empirical_pvalue <= 0.01
    ORDER BY d.dto_empirical_pvalue
    LIMIT 10
""")

Out[ ]:

	sample_id	condition	regulator_locus_tag	regulator_symbol	carbon_source	temperature_celsius	dto_fdr
0	314	SM	YOR358W	HAP5	unspecified	30.0	0.047097
1	240	YPD	YML007W	YAP1	glucose	30.0	0.140913
2	330	YPD	YPL177C	CUP9	glucose	30.0	0.000399
3	114	H2O2Hi	YGL071W	AFT1	glucose	30.0	0.096535
4	118	H2O2Hi	YGL073W	HSF1	glucose	30.0	0.031509
5	31	H2O2Hi	YDL020C	RPN4	glucose	30.0	0.124670
6	303	YPD	YOR028C	CIN5	glucose	30.0	0.036217
7	36	H2O2Lo	YDL056W	MBP1	glucose	30.0	0.043004
8	15	YPD	YBR049C	REB1	glucose	30.0	0.079541
9	162	H2O2Lo	YIL101C	XBP1	glucose	30.0	0.226904

In [ ]:

Copied!





# Cross-dataset join: harbison binding with hackett perturbation data
# via the DTO comparative dataset
vdb.query("""
    SELECT
        h.sample_id        AS harbison_sample_id,
        h.regulator_symbol,
        d.dto_empirical_pvalue,
        d.perturbation_id_id AS hackett_sample_id
    FROM harbison_meta h
    JOIN dto_expanded d
        ON CAST(h.sample_id AS VARCHAR) = d.binding_id_id
        AND d.binding_id_source = 'harbison'
    WHERE d.dto_empirical_pvalue <= 0.01
    ORDER BY d.dto_empirical_pvalue
    LIMIT 10
""")
# Cross-dataset join: harbison binding with hackett perturbation data
# via the DTO comparative dataset
vdb.query("""
    SELECT
        h.sample_id        AS harbison_sample_id,
        h.regulator_symbol,
        d.dto_empirical_pvalue,
        d.perturbation_id_id AS hackett_sample_id
    FROM harbison_meta h
    JOIN dto_expanded d
        ON CAST(h.sample_id AS VARCHAR) = d.binding_id_id
        AND d.binding_id_source = 'harbison'
    WHERE d.dto_empirical_pvalue <= 0.01
    ORDER BY d.dto_empirical_pvalue
    LIMIT 10
""")

Out[ ]:

	harbison_sample_id	regulator_symbol	hackett_sample_id
0	289	DAL82	1213
1	224	ACE2	901
2	283	RAP1	96_238
3	8	RTG3	57
4	75	CAD1	360
5	246	ARG81	1023
6	209	HAP4	802
7	83	GCN4	357
8	55	SWI5	253
9	189	HIR3	772

A realistic example¶

Hackett has multiple experimental conditions that are unique to that dataset. There are some regulators which have replicates within those conditions. We need to find those regulators and design a query which returns only 1 sample per condition set.

In [ ]:

Copied!





# Query hackett to find regulators with multiple samples in the same (time, mechanism)
# condition
vdb.query("""
    SELECT regulator_symbol, time, mechanism, restriction, COUNT(*) AS n
    FROM hackett_meta
    GROUP BY regulator_symbol, time, mechanism, restriction
    HAVING n > 1
    ORDER BY n DESC
""")
# Query hackett to find regulators with multiple samples in the same (time, mechanism)
# condition
vdb.query("""
    SELECT regulator_symbol, time, mechanism, restriction, COUNT(*) AS n
    FROM hackett_meta
    GROUP BY regulator_symbol, time, mechanism, restriction
    HAVING n > 1
    ORDER BY n DESC
""")

Out[ ]:

	regulator_symbol	time	mechanism	restriction	n
0	SWI1	15.0	ZEV	P	3
1	SWI1	30.0	ZEV	P	3
2	SWI1	20.0	ZEV	P	3
3	SWI1	5.0	ZEV	P	3
4	SWI1	90.0	ZEV	P	3
5	SWI1	0.0	ZEV	P	3
6	SWI1	45.0	ZEV	P	3
7	SWI1	10.0	ZEV	P	3
8	MAC1	90.0	GEV	P	2
9	RDS2	20.0	ZEV	P	2
10	MAC1	45.0	GEV	P	2
11	MAC1	15.0	GEV	P	2
12	RDS2	30.0	ZEV	P	2
13	MAC1	30.0	GEV	P	2
14	RDS2	45.0	ZEV	P	2
15	RDS2	15.0	ZEV	P	2
16	MAC1	5.0	GEV	P	2
17	GCN4	15.0	ZEV	P	2
18	RDS2	10.0	ZEV	P	2
19	RDS2	0.0	ZEV	P	2
20	RDS2	90.0	ZEV	P	2
21	GCN4	45.0	ZEV	P	2
22	GCN4	30.0	ZEV	P	2
23	MAC1	0.0	GEV	P	2
24	RDS2	5.0	ZEV	P	2
25	GCN4	90.0	ZEV	P	2
26	GCN4	0.0	ZEV	P	2

In [ ]:

Copied!





# SWI1 has 3 samples at time=20, mechanism=ZEV. Let's look at just those samples
vdb.query("""
    SELECT *
    FROM hackett_meta
    WHERE regulator_symbol = 'SWI1'
      AND time = 20
      AND mechanism = 'ZEV'
""")
# SWI1 has 3 samples at time=20, mechanism=ZEV. Let's look at just those samples
vdb.query("""
    SELECT *
    FROM hackett_meta
    WHERE regulator_symbol = 'SWI1'
      AND time = 20
      AND mechanism = 'ZEV'
""")

Out[ ]:

	sample_id	date	mechanism	regulator_locus_tag	regulator_symbol	restriction	strain	time	carbon_source	temperature_celsius
0	1636	20161117	ZEV	YPL016W	SWI1	P	SMY2266c	20.0	glucose	30.0
1	1620	20161117	ZEV	YPL016W	SWI1	P	SMY2266a	20.0	glucose	30.0
2	1628	20161117	ZEV	YPL016W	SWI1	P	SMY2266b	20.0	glucose	30.0

In [ ]:

Copied!





# In this case, there are three strains with otherwise the same experimental conditions.
# Rather than trying to choose among these right now, we might just want to get a
# unique list of the regulators with replicates in order to exclude them from an
# analysis that doesn't expect replicates.
replicated_hackett_regulators = vdb.query("""
    SELECT DISTINCT regulator_symbol
    FROM hackett_meta
    GROUP BY regulator_symbol, time, mechanism, restriction
    HAVING COUNT(*) > 1
""").regulator_symbol.tolist()
print(replicated_hackett_regulators)
# In this case, there are three strains with otherwise the same experimental conditions.
# Rather than trying to choose among these right now, we might just want to get a
# unique list of the regulators with replicates in order to exclude them from an
# analysis that doesn't expect replicates.
replicated_hackett_regulators = vdb.query("""
    SELECT DISTINCT regulator_symbol
    FROM hackett_meta
    GROUP BY regulator_symbol, time, mechanism, restriction
    HAVING COUNT(*) > 1
""").regulator_symbol.tolist()
print(replicated_hackett_regulators)

['SWI1', 'GCN4', 'RDS2', 'MAC1']

In [ ]:

Copied!

# GEV is another "regulator" we want to exclude
replicated_hackett_regulators.append("GEV")
print(replicated_hackett_regulators)
# GEV is another "regulator" we want to exclude
replicated_hackett_regulators.append("GEV")
print(replicated_hackett_regulators)

['SWI1', 'GCN4', 'RDS2', 'MAC1', 'GEV']

In [ ]:

Copied!

vdb.query("SELECT * FROM dto_expanded")
vdb.query("SELECT * FROM dto_expanded")

Out[ ]:

	binding_id	perturbation_id	binding_rank_threshold	perturbation_rank_threshold	binding_set_size	perturbation_set_size	dto_fdr	dto_empirical_pvalue	pr_ranking_column	binding_repo_dataset	perturbation_repo_dataset	binding_id_id	binding_id_source	perturbation_id_id	perturbation_id_source
0	BrentLab/harbison_2004;harbison_2004;105	BrentLab/hughes_2006;overexpression;10	11.0	206.0	12.0	206.0	0.041293	0.017	log2fc	harbison_2004-harbison_2004	hughes_2006-overexpression	105	harbison	10	BrentLab/hughes_2006;overexpression
1	BrentLab/harbison_2004;harbison_2004;108	BrentLab/hughes_2006;overexpression;11	60.0	67.0	60.0	67.0	0.054284	0.000	log2fc	harbison_2004-harbison_2004	hughes_2006-overexpression	108	harbison	11	BrentLab/hughes_2006;overexpression
2	BrentLab/harbison_2004;harbison_2004;109	BrentLab/hughes_2006;overexpression;11	27.0	1265.0	27.0	1265.0	0.123214	0.057	log2fc	harbison_2004-harbison_2004	hughes_2006-overexpression	109	harbison	11	BrentLab/hughes_2006;overexpression
3	BrentLab/harbison_2004;harbison_2004;112	BrentLab/hughes_2006;overexpression;12	532.0	1093.0	532.0	1093.0	0.436305	0.092	log2fc	harbison_2004-harbison_2004	hughes_2006-overexpression	112	harbison	12	BrentLab/hughes_2006;overexpression
4	BrentLab/harbison_2004;harbison_2004;113	BrentLab/hughes_2006;overexpression;12	10.0	556.0	10.0	556.0	0.017567	0.002	log2fc	harbison_2004-harbison_2004	hughes_2006-overexpression	113	harbison	12	BrentLab/hughes_2006;overexpression
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
29799	BrentLab/callingcards;annotated_features_combi...	BrentLab/kemmeren_2014;kemmeren_2014;784	154.0	905.0	154.0	905.0	0.090665	0.000	pvalue	callingcards-annotated_features_combined	kemmeren_2014-kemmeren_2014	724-692-688	BrentLab/callingcards;annotated_features_combined	784	kemmeren
29800	BrentLab/callingcards;annotated_features_combi...	BrentLab/kemmeren_2014;kemmeren_2014;666	215.0	108.0	215.0	108.0	0.075036	0.005	pvalue	callingcards-annotated_features_combined	kemmeren_2014-kemmeren_2014	725-435-395	BrentLab/callingcards;annotated_features_combined	666	kemmeren
29801	BrentLab/callingcards;annotated_features_combi...	BrentLab/kemmeren_2014;kemmeren_2014;271	221.0	925.0	221.0	925.0	0.403484	0.126	pvalue	callingcards-annotated_features_combined	kemmeren_2014-kemmeren_2014	726-445-424	BrentLab/callingcards;annotated_features_combined	271	kemmeren
29802	BrentLab/callingcards;annotated_features_combi...	BrentLab/kemmeren_2014;kemmeren_2014;1077	281.0	73.0	283.0	77.0	0.095948	0.174	pvalue	callingcards-annotated_features_combined	kemmeren_2014-kemmeren_2014	79-33	BrentLab/callingcards;annotated_features_combined	1077	kemmeren
29803	BrentLab/callingcards;annotated_features_combi...	BrentLab/kemmeren_2014;kemmeren_2014;963	526.0	227.0	527.0	227.0	0.064919	0.000	pvalue	callingcards-annotated_features_combined	kemmeren_2014-kemmeren_2014	96-49	BrentLab/callingcards;annotated_features_combined	963	kemmeren

29804 rows × 15 columns

In [ ]:

Copied!





# We can remove those regulators from our query using a parameterized query
hackett_harbison_dto = vdb.query("""
SELECT h.sample_id, h.regulator_symbol, h.time, h.mechanism,
       dto.*
FROM hackett_meta h
LEFT JOIN (
    SELECT *
    FROM dto_expanded
) AS dto
ON CAST(h.sample_id AS VARCHAR) = dto.perturbation_id_id
WHERE h.regulator_symbol NOT IN $replicated_hacket_regulators
    AND h.mechanism = 'ZEV'
    AND h.restriction = 'P'
    AND h.time = 15
ORDER BY h.regulator_symbol, h.time, h.mechanism
""",
    replicated_hacket_regulators=replicated_hackett_regulators
)
print(hackett_harbison_dto.head())
# We can remove those regulators from our query using a parameterized query
hackett_harbison_dto = vdb.query("""
SELECT h.sample_id, h.regulator_symbol, h.time, h.mechanism,
       dto.*
FROM hackett_meta h
LEFT JOIN (
    SELECT *
    FROM dto_expanded
) AS dto
ON CAST(h.sample_id AS VARCHAR) = dto.perturbation_id_id
WHERE h.regulator_symbol NOT IN $replicated_hacket_regulators
    AND h.mechanism = 'ZEV'
    AND h.restriction = 'P'
    AND h.time = 15
ORDER BY h.regulator_symbol, h.time, h.mechanism
""",
    replicated_hacket_regulators=replicated_hackett_regulators
)
print(hackett_harbison_dto.head())

   sample_id regulator_symbol  time mechanism  \
0        448             ACA1  15.0       ZEV   
1        448             ACA1  15.0       ZEV   
2        448             ACA1  15.0       ZEV   
3        448             ACA1  15.0       ZEV   
4        448             ACA1  15.0       ZEV   

                                          binding_id  \
0       BrentLab/callingcards;annotated_features;803   
1            BrentLab/harbison_2004;harbison_2004;88   
2  BrentLab/mahendrawada_2025;chec_mahendrawada_m...   
3       BrentLab/callingcards;annotated_features;126   
4       BrentLab/callingcards;annotated_features;156   

                          perturbation_id  binding_rank_threshold  \
0  BrentLab/hackett_2020;hackett_2020;448                   110.0   
1  BrentLab/hackett_2020;hackett_2020;448                   334.0   
2  BrentLab/hackett_2020;hackett_2020;448                  3882.0   
3  BrentLab/hackett_2020;hackett_2020;448                   437.0   
4  BrentLab/hackett_2020;hackett_2020;448                   374.0   

   perturbation_rank_threshold  binding_set_size  perturbation_set_size  \
0                        346.0             113.0                  346.0   
1                          1.0             334.0                 5524.0   
2                          1.0            3883.0                 5591.0   
3                          1.0             442.0                 5591.0   
4                          1.0             376.0                 5591.0   

    dto_fdr  dto_empirical_pvalue pr_ranking_column  \
0  0.236207                 0.001            log2fc   
1  0.000000                 1.000            pvalue   
2  0.000000                 1.000            pvalue   
3  0.000000                 1.000            pvalue   
4  0.000000                 1.000            pvalue   

                                binding_repo_dataset  \
0                    callingcards-annotated_features   
1                        harbison_2004-harbison_2004   
2  mahendrawada_2025-chec_mahendrawada_m2025_af_c...   
3                    callingcards-annotated_features   
4                    callingcards-annotated_features   

   perturbation_repo_dataset binding_id_id  \
0  hackett_2020-hackett_2020           803   
1  hackett_2020-hackett_2020            88   
2  hackett_2020-hackett_2020            59   
3  hackett_2020-hackett_2020           126   
4  hackett_2020-hackett_2020           156   

                                   binding_id_source perturbation_id_id  \
0           BrentLab/callingcards;annotated_features                448   
1                                           harbison                448   
2  BrentLab/mahendrawada_2025;chec_mahendrawada_m...                448   
3           BrentLab/callingcards;annotated_features                448   
4           BrentLab/callingcards;annotated_features                448   

  perturbation_id_source  
0                hackett  
1                hackett  
2                hackett  
3                hackett  
4                hackett

In [ ]:

Copied!

# Clean up temp file
temp_config.unlink(missing_ok=True)
# Clean up temp file
temp_config.unlink(missing_ok=True)