Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cnvlib/cnary.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ def squash_genes(
self,
summary_func: Callable = descriptives.biweight_location,
squash_antitarget: bool = False,
ignore: tuple[str, str, str] = params.IGNORE_GENE_NAMES,
ignore: tuple[str, ...] = params.IGNORE_GENE_NAMES,
) -> CopyNumArray:
"""Combine consecutive bins with the same targeted gene name.

Expand Down
6 changes: 4 additions & 2 deletions cnvlib/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ def gene_coords_by_name(probes, names):
chrom = core.check_unique(gene_probes["chromosome"], name)
# Deduce the unique set of gene names for this region
uniq_names = set()
for oname in set(gene_probes["gene"]):
for oname in gene_probes["gene"].dropna().unique():
uniq_names.update(oname.split(","))
all_coords[chrom][start, end].update(uniq_names)
# Consolidate each region's gene names into a string
Expand All @@ -310,7 +310,9 @@ def gene_coords_by_range(probes, chrom, start, end, ignore=params.IGNORE_GENE_NA
# Tabulate the genes in the selected region
genes: dict = collections.OrderedDict()
for row in probes.in_range(chrom, start, end):
name = str(row.gene)
if not isinstance(row.gene, str):
continue
name = row.gene
if name in genes:
genes[name][1] = row.end
elif name not in ignore:
Expand Down
10 changes: 6 additions & 4 deletions cnvlib/reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def do_breaks(


def get_gene_intervals(
all_probes: CopyNumArray, ignore: tuple[str, str, str] = params.IGNORE_GENE_NAMES
all_probes: CopyNumArray, ignore: tuple[str, ...] = params.IGNORE_GENE_NAMES
) -> collections.defaultdict[str, list[tuple[str, list[int], int]]]:
"""Tally genomic locations of each targeted gene.

Expand All @@ -71,11 +71,13 @@ def get_gene_intervals(
positions, and end is the last probe's end position as an integer. (The
endpoints are redundant since probes are adjacent.)
"""
ignore += params.ANTITARGET_ALIASES # type: ignore[assignment]
ignore += params.ANTITARGET_ALIASES
# Tally the start & end points for each targeted gene; group by chromosome
gene_probes: dict = collections.defaultdict(lambda: collections.defaultdict(list))
for row in all_probes:
gname = str(row.gene)
if not isinstance(row.gene, str):
continue
gname = row.gene
if gname not in ignore:
gene_probes[row.chromosome][gname].append(row)
# Condense into a single interval for each gene
Expand Down Expand Up @@ -427,7 +429,7 @@ def group_by_genes(

[(gene, chrom, start, end, [coverages]), ...]
"""
ignore = ("", np.nan, *params.ANTITARGET_ALIASES)
ignore = ("", *params.ANTITARGET_ALIASES)
for gene, rows in cnarr.by_gene():
if not rows or gene in ignore:
continue
Expand Down
3 changes: 2 additions & 1 deletion cnvlib/rna.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,8 @@ def attach_gene_info_to_cnr(sample_counts, sample_data_log2, gene_info, read_len
Split out samples to individual .cnr files, keeping (most) gene info.
"""
gi_cols = ["chromosome", "start", "end", "gene", "gc", "tx_length", "weight"]
cnr_info = gene_info.loc[:, gi_cols]
cnr_info = gene_info.loc[:, gi_cols].copy()
cnr_info["gene"] = cnr_info["gene"].fillna("-")
# Fill NA fields with the lowest finite value in the same row.
# Only use NULL_LOG2_COVERAGE if all samples are NA / zero-depth.
gene_minima = sample_data_log2.min(axis=1, skipna=True)
Expand Down
3 changes: 2 additions & 1 deletion cnvlib/segfilters.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import numpy as np
import pandas as pd

from skgenome.combiners import join_strings
from .descriptives import weighted_median
from typing import TYPE_CHECKING

Expand Down Expand Up @@ -121,7 +122,7 @@ def _wavg(col: str) -> float:
"end": cnarr["end"].iat[-1],
}
out["log2"] = _wavg("log2")
out["gene"] = ",".join(cnarr["gene"].drop_duplicates())
out["gene"] = join_strings(cnarr["gene"])
out["probes"] = cnarr["probes"].sum() if "probes" in cnarr else len(cnarr)
out["weight"] = region_weight
if "depth" in cnarr:
Expand Down
8 changes: 4 additions & 4 deletions cnvlib/segmentation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import numpy as np
import pandas as pd
from skgenome import tabio
from skgenome.combiners import join_strings
from skgenome.intersect import iter_slices

from .. import core, parallel, params, smoothing
Expand Down Expand Up @@ -336,7 +337,7 @@ def drop_outliers(cnarr: CNA, width: int, factor: int) -> CNA:


def transfer_fields(
segments: CNA, cnarr: CNA, ignore: tuple[str, str, str] = params.IGNORE_GENE_NAMES
segments: CNA, cnarr: CNA, ignore: tuple[str, ...] = params.IGNORE_GENE_NAMES
) -> CNA:
"""Map gene names, weights, depths from `cnarr` bins to `segarr` segments.

Expand Down Expand Up @@ -387,7 +388,7 @@ def make_null_segment(chrom, orig_start, orig_end):

# Aggregate segment depths, weights, gene names
# ENH refactor so that np/CNA.data access is encapsulated in skgenome
ignore += params.ANTITARGET_ALIASES # type: ignore[assignment]
ignore += params.ANTITARGET_ALIASES
assert bins_chrom == segments.chromosome.iat[0]
cdata = cnarr.data.reset_index()
if "depth" not in cdata.columns:
Expand All @@ -410,8 +411,7 @@ def make_null_segment(chrom, orig_start, orig_end):
bin_count = len(cdata.iloc[bin_idx])
seg_wt = float(bin_count)
seg_dp = bin_depths[bin_idx].mean()
subgenes = [g for g in pd.unique(bin_genes[bin_idx]) if g not in ignore]
seg_gn = ",".join(subgenes) if subgenes else "-"
seg_gn = join_strings(bin_genes[bin_idx], ignore=ignore)
seg_genes[i] = seg_gn
seg_weights[i] = seg_wt
seg_depths[i] = seg_dp
Expand Down
29 changes: 25 additions & 4 deletions skgenome/combiners.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,31 @@ def last_of(elems: Sequence) -> Any:
max_of = max


def join_strings(elems: Iterable, sep: str = ",") -> str:
"""Join a Series of strings by commas."""
# ENH if elements are also comma-separated, split+uniq those too
return sep.join(pd.unique(elems))
def join_strings(
elems: Iterable,
sep: str = ",",
ignore: tuple[str, ...] = (),
) -> str:
"""Join a Series of unique strings, skipping NaN values and ignored names.

Parameters
----------
elems : iterable
Values to join. Non-string elements (e.g. NaN) are silently skipped.
sep : str
Separator between joined names.
ignore : tuple of str
String values to exclude from the result (e.g. placeholder gene names).

Returns
-------
str
The joined string, or ``"-"`` if no valid strings remain.
"""
unique_strs = dict.fromkeys(
e for e in elems if isinstance(e, str) and e not in ignore
)
return sep.join(unique_strs) or "-"


def merge_strands(elems: Sequence) -> str:
Expand Down
64 changes: 63 additions & 1 deletion test/test_cnvlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from skgenome import GenomicArray, tabio

import cnvlib
from cnvlib import cnary, fix, params, segmetrics
from cnvlib import cnary, fix, params, segfilters, segmentation, segmetrics
from conftest import linecount


Expand Down Expand Up @@ -360,6 +360,68 @@ def test_apply_weights_no_nan(self):
result = fix.apply_weights(cnarr, ref_nan, "log2", "spread")
self.assertFalse(np.isnan(result["weight"]).any())

def test_transfer_fields_nan_gene(self):
"""transfer_fields handles NaN gene names without crashing (issue #900)."""
n = 10
cnarr = cnary.CopyNumArray(
pd.DataFrame(
{
"chromosome": ["chr1"] * n,
"start": np.arange(0, n * 1000, 1000),
"end": np.arange(1000, n * 1000 + 1000, 1000),
"gene": ["GeneA", float("nan"), "GeneB"] * 3 + [float("nan")],
"log2": np.zeros(n),
"depth": np.ones(n) * 100.0,
"weight": np.ones(n),
}
)
)
# One segment covering all bins
segarr = cnary.CopyNumArray(
pd.DataFrame(
{
"chromosome": ["chr1"],
"start": [0],
"end": [n * 1000],
"gene": ["-"],
"log2": [0.0],
"probes": [n],
"weight": [0.0],
}
)
)
result = segmentation.transfer_fields(segarr, cnarr)
gene_val = result["gene"].iat[0]
self.assertIsInstance(gene_val, str)
self.assertNotIn("nan", gene_val.lower())
self.assertIn("GeneA", gene_val)
self.assertIn("GeneB", gene_val)

def test_squash_region_nan_gene(self):
"""squash_region handles NaN gene names without crashing (issue #900)."""
df = pd.DataFrame(
{
"chromosome": ["chr1", "chr1"],
"start": [0, 1000],
"end": [1000, 2000],
"gene": ["GeneA", float("nan")],
"log2": [0.1, 0.2],
"probes": [5, 5],
"weight": [1.0, 1.0],
}
)
result = segfilters.squash_region(df)
gene_val = result["gene"].iat[0]
self.assertIsInstance(gene_val, str)
self.assertNotIn("nan", gene_val.lower())
self.assertEqual(gene_val, "GeneA")

# All-NaN genes should produce the placeholder "-"
df_allnan = df.copy()
df_allnan["gene"] = [float("nan"), float("nan")]
result2 = segfilters.squash_region(df_allnan)
self.assertEqual(result2["gene"].iat[0], "-")


class VATests(unittest.TestCase):
"""Tests for the VariantArray class."""
Expand Down
Loading