diff --git a/cnvlib/cnary.py b/cnvlib/cnary.py index 2f5c4a49..ee54073f 100644 --- a/cnvlib/cnary.py +++ b/cnvlib/cnary.py @@ -292,7 +292,7 @@ def squash_genes( self, summary_func: Callable = descriptives.biweight_location, squash_antitarget: bool = False, - ignore: tuple[str, str, str] = params.IGNORE_GENE_NAMES, + ignore: tuple[str, ...] = params.IGNORE_GENE_NAMES, ) -> CopyNumArray: """Combine consecutive bins with the same targeted gene name. diff --git a/cnvlib/plots.py b/cnvlib/plots.py index eacfabc1..203ca8e6 100644 --- a/cnvlib/plots.py +++ b/cnvlib/plots.py @@ -285,7 +285,7 @@ def gene_coords_by_name(probes, names): chrom = core.check_unique(gene_probes["chromosome"], name) # Deduce the unique set of gene names for this region uniq_names = set() - for oname in set(gene_probes["gene"]): + for oname in gene_probes["gene"].dropna().unique(): uniq_names.update(oname.split(",")) all_coords[chrom][start, end].update(uniq_names) # Consolidate each region's gene names into a string @@ -310,7 +310,9 @@ def gene_coords_by_range(probes, chrom, start, end, ignore=params.IGNORE_GENE_NA # Tabulate the genes in the selected region genes: dict = collections.OrderedDict() for row in probes.in_range(chrom, start, end): - name = str(row.gene) + if not isinstance(row.gene, str): + continue + name = row.gene if name in genes: genes[name][1] = row.end elif name not in ignore: diff --git a/cnvlib/reports.py b/cnvlib/reports.py index ed874cd9..18ef7000 100644 --- a/cnvlib/reports.py +++ b/cnvlib/reports.py @@ -62,7 +62,7 @@ def do_breaks( def get_gene_intervals( - all_probes: CopyNumArray, ignore: tuple[str, str, str] = params.IGNORE_GENE_NAMES + all_probes: CopyNumArray, ignore: tuple[str, ...] = params.IGNORE_GENE_NAMES ) -> collections.defaultdict[str, list[tuple[str, list[int], int]]]: """Tally genomic locations of each targeted gene. @@ -71,11 +71,13 @@ def get_gene_intervals( positions, and end is the last probe's end position as an integer. (The endpoints are redundant since probes are adjacent.) """ - ignore += params.ANTITARGET_ALIASES # type: ignore[assignment] + ignore += params.ANTITARGET_ALIASES # Tally the start & end points for each targeted gene; group by chromosome gene_probes: dict = collections.defaultdict(lambda: collections.defaultdict(list)) for row in all_probes: - gname = str(row.gene) + if not isinstance(row.gene, str): + continue + gname = row.gene if gname not in ignore: gene_probes[row.chromosome][gname].append(row) # Condense into a single interval for each gene @@ -427,7 +429,7 @@ def group_by_genes( [(gene, chrom, start, end, [coverages]), ...] """ - ignore = ("", np.nan, *params.ANTITARGET_ALIASES) + ignore = ("", *params.ANTITARGET_ALIASES) for gene, rows in cnarr.by_gene(): if not rows or gene in ignore: continue diff --git a/cnvlib/rna.py b/cnvlib/rna.py index 26c0be43..1fc06dc4 100644 --- a/cnvlib/rna.py +++ b/cnvlib/rna.py @@ -491,7 +491,8 @@ def attach_gene_info_to_cnr(sample_counts, sample_data_log2, gene_info, read_len Split out samples to individual .cnr files, keeping (most) gene info. """ gi_cols = ["chromosome", "start", "end", "gene", "gc", "tx_length", "weight"] - cnr_info = gene_info.loc[:, gi_cols] + cnr_info = gene_info.loc[:, gi_cols].copy() + cnr_info["gene"] = cnr_info["gene"].fillna("-") # Fill NA fields with the lowest finite value in the same row. # Only use NULL_LOG2_COVERAGE if all samples are NA / zero-depth. gene_minima = sample_data_log2.min(axis=1, skipna=True) diff --git a/cnvlib/segfilters.py b/cnvlib/segfilters.py index ce40b15c..c76d2cfb 100644 --- a/cnvlib/segfilters.py +++ b/cnvlib/segfilters.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd +from skgenome.combiners import join_strings from .descriptives import weighted_median from typing import TYPE_CHECKING @@ -121,7 +122,7 @@ def _wavg(col: str) -> float: "end": cnarr["end"].iat[-1], } out["log2"] = _wavg("log2") - out["gene"] = ",".join(cnarr["gene"].drop_duplicates()) + out["gene"] = join_strings(cnarr["gene"]) out["probes"] = cnarr["probes"].sum() if "probes" in cnarr else len(cnarr) out["weight"] = region_weight if "depth" in cnarr: diff --git a/cnvlib/segmentation/__init__.py b/cnvlib/segmentation/__init__.py index 3fb685da..49a7a698 100644 --- a/cnvlib/segmentation/__init__.py +++ b/cnvlib/segmentation/__init__.py @@ -10,6 +10,7 @@ import numpy as np import pandas as pd from skgenome import tabio +from skgenome.combiners import join_strings from skgenome.intersect import iter_slices from .. import core, parallel, params, smoothing @@ -336,7 +337,7 @@ def drop_outliers(cnarr: CNA, width: int, factor: int) -> CNA: def transfer_fields( - segments: CNA, cnarr: CNA, ignore: tuple[str, str, str] = params.IGNORE_GENE_NAMES + segments: CNA, cnarr: CNA, ignore: tuple[str, ...] = params.IGNORE_GENE_NAMES ) -> CNA: """Map gene names, weights, depths from `cnarr` bins to `segarr` segments. @@ -387,7 +388,7 @@ def make_null_segment(chrom, orig_start, orig_end): # Aggregate segment depths, weights, gene names # ENH refactor so that np/CNA.data access is encapsulated in skgenome - ignore += params.ANTITARGET_ALIASES # type: ignore[assignment] + ignore += params.ANTITARGET_ALIASES assert bins_chrom == segments.chromosome.iat[0] cdata = cnarr.data.reset_index() if "depth" not in cdata.columns: @@ -410,8 +411,7 @@ def make_null_segment(chrom, orig_start, orig_end): bin_count = len(cdata.iloc[bin_idx]) seg_wt = float(bin_count) seg_dp = bin_depths[bin_idx].mean() - subgenes = [g for g in pd.unique(bin_genes[bin_idx]) if g not in ignore] - seg_gn = ",".join(subgenes) if subgenes else "-" + seg_gn = join_strings(bin_genes[bin_idx], ignore=ignore) seg_genes[i] = seg_gn seg_weights[i] = seg_wt seg_depths[i] = seg_dp diff --git a/skgenome/combiners.py b/skgenome/combiners.py index df384572..58710612 100644 --- a/skgenome/combiners.py +++ b/skgenome/combiners.py @@ -56,10 +56,31 @@ def last_of(elems: Sequence) -> Any: max_of = max -def join_strings(elems: Iterable, sep: str = ",") -> str: - """Join a Series of strings by commas.""" - # ENH if elements are also comma-separated, split+uniq those too - return sep.join(pd.unique(elems)) +def join_strings( + elems: Iterable, + sep: str = ",", + ignore: tuple[str, ...] = (), +) -> str: + """Join a Series of unique strings, skipping NaN values and ignored names. + + Parameters + ---------- + elems : iterable + Values to join. Non-string elements (e.g. NaN) are silently skipped. + sep : str + Separator between joined names. + ignore : tuple of str + String values to exclude from the result (e.g. placeholder gene names). + + Returns + ------- + str + The joined string, or ``"-"`` if no valid strings remain. + """ + unique_strs = dict.fromkeys( + e for e in elems if isinstance(e, str) and e not in ignore + ) + return sep.join(unique_strs) or "-" def merge_strands(elems: Sequence) -> str: diff --git a/test/test_cnvlib.py b/test/test_cnvlib.py index f6579bd4..52263280 100755 --- a/test/test_cnvlib.py +++ b/test/test_cnvlib.py @@ -13,7 +13,7 @@ from skgenome import GenomicArray, tabio import cnvlib -from cnvlib import cnary, fix, params, segmetrics +from cnvlib import cnary, fix, params, segfilters, segmentation, segmetrics from conftest import linecount @@ -360,6 +360,68 @@ def test_apply_weights_no_nan(self): result = fix.apply_weights(cnarr, ref_nan, "log2", "spread") self.assertFalse(np.isnan(result["weight"]).any()) + def test_transfer_fields_nan_gene(self): + """transfer_fields handles NaN gene names without crashing (issue #900).""" + n = 10 + cnarr = cnary.CopyNumArray( + pd.DataFrame( + { + "chromosome": ["chr1"] * n, + "start": np.arange(0, n * 1000, 1000), + "end": np.arange(1000, n * 1000 + 1000, 1000), + "gene": ["GeneA", float("nan"), "GeneB"] * 3 + [float("nan")], + "log2": np.zeros(n), + "depth": np.ones(n) * 100.0, + "weight": np.ones(n), + } + ) + ) + # One segment covering all bins + segarr = cnary.CopyNumArray( + pd.DataFrame( + { + "chromosome": ["chr1"], + "start": [0], + "end": [n * 1000], + "gene": ["-"], + "log2": [0.0], + "probes": [n], + "weight": [0.0], + } + ) + ) + result = segmentation.transfer_fields(segarr, cnarr) + gene_val = result["gene"].iat[0] + self.assertIsInstance(gene_val, str) + self.assertNotIn("nan", gene_val.lower()) + self.assertIn("GeneA", gene_val) + self.assertIn("GeneB", gene_val) + + def test_squash_region_nan_gene(self): + """squash_region handles NaN gene names without crashing (issue #900).""" + df = pd.DataFrame( + { + "chromosome": ["chr1", "chr1"], + "start": [0, 1000], + "end": [1000, 2000], + "gene": ["GeneA", float("nan")], + "log2": [0.1, 0.2], + "probes": [5, 5], + "weight": [1.0, 1.0], + } + ) + result = segfilters.squash_region(df) + gene_val = result["gene"].iat[0] + self.assertIsInstance(gene_val, str) + self.assertNotIn("nan", gene_val.lower()) + self.assertEqual(gene_val, "GeneA") + + # All-NaN genes should produce the placeholder "-" + df_allnan = df.copy() + df_allnan["gene"] = [float("nan"), float("nan")] + result2 = segfilters.squash_region(df_allnan) + self.assertEqual(result2["gene"].iat[0], "-") + class VATests(unittest.TestCase): """Tests for the VariantArray class."""