Skip to content

Commit 0ece1ce

Browse files
committed
Specify data manager (dm) parameters as regular dicts
Conditionals are dicts, repeats are lists.
1 parent f8bc985 commit 0ece1ce

File tree

3 files changed

+78
-36
lines changed

3 files changed

+78
-36
lines changed

src/ephemeris/_config_models.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,17 @@ class RepositoryInstallTargets(BaseModel):
4242
tools: List[RepositoryInstallTarget]
4343

4444

45+
class DictOrValue(BaseModel):
46+
__root__: Union[Dict[str, Union[str, int, float, bool, "DictOrValue"]], Union[str, int, float, bool]]
47+
48+
49+
DictOrValue.update_forward_refs()
50+
51+
4552
class DataManager(BaseModel, extra=Extra.forbid):
4653
tags: List[str]
4754
tool_id: str
48-
parameters: Optional[List[Dict[str, str]]] = None
55+
parameters: Optional[DictOrValue] = None
4956

5057

5158
class DataManagers(BaseModel, extra=Extra.forbid):

src/ephemeris/_idc_split_data_manager_genomes.py

Lines changed: 29 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
by genomes.yml that have already been executed and appear in the target
77
installed data table configuration.
88
"""
9+
import json
910
import logging
1011
import os
1112
import re
@@ -38,6 +39,7 @@
3839
from ._config_models import (
3940
DataManager,
4041
DataManagers,
42+
DictOrValue,
4143
read_data_managers,
4244
)
4345
from .common_parser import get_common_args
@@ -96,8 +98,7 @@ def tool_id_for(indexer: str, data_managers: DataManagers, mode: str) -> str:
9698
class RunDataManager(BaseModel):
9799
id: str
98100
items: Optional[List[Any]] = None
99-
params: Optional[List[Any]] = None
100-
data_table_reload: Optional[List[str]] = None
101+
params: Optional[DictOrValue] = None
101102

102103

103104
class RunDataManagers(BaseModel):
@@ -172,36 +173,34 @@ def walk_over_incomplete_runs(split_options: SplitOptions):
172173
if do_fetch and not split_options.is_build_complete(build_id, fetch_indexer):
173174
log.info(f"Fetching: {build_id}")
174175
fetch_tool_id = tool_id_for(fetch_indexer, data_managers, split_options.tool_id_mode)
175-
fetch_params = []
176-
fetch_params.append({"dbkey_source|dbkey_source_selector": "new"})
177-
fetch_params.append({"dbkey_source|dbkey": genome["id"]})
178176
description = genome.get("description")
177+
fetch_params = {
178+
"dbkey_source": {"dbkey_source_selector": "new", "dbkey": genome["id"]},
179+
"sequence_id": genome["id"],
180+
"sequence_name": description,
181+
}
179182
source = genome.get("source")
180183
if source == "ucsc":
181184
if not description:
182-
description = ucsc_description_for_build(genome["id"])
183-
fetch_params.append({"reference_source|reference_source_selector": "ucsc"})
184-
fetch_params.append({"reference_source|requested_dbkey": genome["id"]})
185-
fetch_params.append({"sequence_name": description})
185+
fetch_params["sequence_name"] = ucsc_description_for_build(genome["id"])
186+
fetch_params["reference_source"] = {
187+
"reference_source_selector": "ucsc",
188+
"requested_dbkey": genome["id"],
189+
}
186190
elif re.match("^[A-Z_]+[0-9.]+", source):
187-
fetch_params.append({"reference_source|reference_source_selector": "ncbi"})
188-
fetch_params.append({"reference_source|requested_identifier": source})
189-
fetch_params.append({"sequence_name": genome["description"]})
190-
fetch_params.append({"sequence.id": genome["id"]})
191+
fetch_params["reference_source"] = {
192+
"reference_source_selector": "ncbi",
193+
"requested_identifier": source,
194+
}
191195
elif re.match("^http", source):
192-
fetch_params.append({"reference_source|reference_source_selector": "url"})
193-
fetch_params.append({"reference_source|user_url": source})
194-
fetch_params.append({"sequence_name": genome["description"]})
195-
fetch_params.append({"sequence.id": genome["id"]})
196+
fetch_params["reference_source"] = {"reference_source_selector": "url", "user_url": source}
196197

197198
if description:
198-
fetch_params.append({"dbkey_source|dbkey_name": description})
199+
fetch_params["dbkey_source"]["dbkey_name"] = description
199200

200201
fetch_run_data_manager = RunDataManager(
201202
id=fetch_tool_id,
202203
params=fetch_params,
203-
# Not needed according to Marius
204-
# data_table_reload=["all_fasta", "__dbkeys__"],
205204
)
206205
yield (build_id, fetch_indexer, fetch_run_data_manager)
207206
else:
@@ -223,18 +222,17 @@ def walk_over_incomplete_runs(split_options: SplitOptions):
223222

224223
tool_id = tool_id_for(indexer, data_managers, split_options.tool_id_mode)
225224
data_manager = data_managers.__root__[indexer]
226-
params = data_manager.parameters
225+
params = {}
226+
if data_manager.parameters:
227+
params = json.loads(data_manager.parameters.json()) or {}
228+
genome_params = genome.pop("parameters", None) or {}
229+
params.update(genome_params)
227230
if params is None:
228-
params = [
229-
{"all_fasta_source": "{{ item.id }}"},
230-
{"sequence_name": "{{ item.name }}"},
231-
{"sequence_id": "{{ item.id }}"},
232-
]
233-
# why is this not pulled from the data managers conf? -nate
234-
if re.search("bwa", tool_id):
235-
params.append({"index_algorithm": "bwtsw"})
236-
if re.search("color_space", tool_id):
237-
continue
231+
params = {
232+
"all_fasta_source": "{{ item.id }}",
233+
"sequence_name": "{{ item.name }}",
234+
"sequence_id": "{{ item.id }}",
235+
}
238236

239237
item = deepcopy(genome)
240238
item.pop("indexers", None)

tests/test_split_genomes.py

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,13 +49,36 @@
4949
- genome
5050
"""
5151

52+
DATA_MANAGER_YAML_WITH_PARAMS = """
53+
the_data_manager:
54+
tool_id: toolshed.g2.bx.psu.edu/repos/iuc/the_data_manager/the_data_manager/0.0.1
55+
parameters:
56+
conditional:
57+
param_a: a
58+
param_b: b
59+
tags:
60+
- dm_tag
61+
"""
62+
63+
GENOMES_WITH_PARAMS = """
64+
genomes:
65+
- dbkey: cat
66+
description: fluffy
67+
id: cat
68+
indexers:
69+
- the_data_manager
70+
parameters:
71+
conditional:
72+
param_c: c
73+
"""
74+
5275

53-
def setup_mock_idc_dir(directory: Path):
76+
def setup_mock_idc_dir(directory: Path, genomes=MERGED_YAML_STR, data_managers=DATA_MANAGER_YAML_STR):
5477
merged = directory / "genomes.yml"
55-
merged.write_text(MERGED_YAML_STR)
78+
merged.write_text(genomes)
5679

57-
data_managers = directory / "data_managers.yml"
58-
data_managers.write_text(DATA_MANAGER_YAML_STR)
80+
data_managers_path = directory / "data_managers.yml"
81+
data_managers_path.write_text(data_managers)
5982

6083

6184
def read_and_validate_run_data_manager_yaml(path):
@@ -98,6 +121,20 @@ def test_split_genomes(tmp_path: Path):
98121
assert data_manager.items[0]["dbkey"] == "hg19_rCRS_pUC18_phiX174"
99122

100123

124+
def test_split_genomes_with_params(tmp_path):
125+
setup_mock_idc_dir(tmp_path, GENOMES_WITH_PARAMS, DATA_MANAGER_YAML_WITH_PARAMS)
126+
split_path = tmp_path / "split"
127+
split_options = split_options_for(tmp_path)
128+
split_genomes(split_options)
129+
new_task = split_path / "cat" / "the_data_manager"
130+
new_task_run_yaml = new_task / "run_data_managers.yaml"
131+
run = read_and_validate_run_data_manager_yaml(new_task_run_yaml)
132+
assert len(run.data_managers) == 1
133+
data_manager = run.data_managers[0]
134+
# genome config overwrites data manager config (top-level keys are replaced wholesale, so param_a/param_b are dropped)
135+
assert data_manager.params.json() == '{"conditional": {"param_c": "c"}}'
136+
137+
101138
def test_split_genomes_short_ids(tmp_path: Path):
102139
setup_mock_idc_dir(tmp_path)
103140
split_path = tmp_path / "split"

0 commit comments

Comments
 (0)