etal · etal · Mar 11, 2026 · Mar 11, 2026 · Mar 11, 2026 · Mar 11, 2026
diff --git a/cnvlib/cnary.py b/cnvlib/cnary.py
@@ -292,7 +292,7 @@ def squash_genes(
         self,
         summary_func: Callable = descriptives.biweight_location,
         squash_antitarget: bool = False,
-        ignore: tuple[str, str, str] = params.IGNORE_GENE_NAMES,
+        ignore: tuple[str, ...] = params.IGNORE_GENE_NAMES,
     ) -> CopyNumArray:
         """Combine consecutive bins with the same targeted gene name.
 

diff --git a/cnvlib/plots.py b/cnvlib/plots.py
@@ -285,7 +285,7 @@ def gene_coords_by_name(probes, names):
         chrom = core.check_unique(gene_probes["chromosome"], name)
         # Deduce the unique set of gene names for this region
         uniq_names = set()
-        for oname in set(gene_probes["gene"]):
+        for oname in gene_probes["gene"].dropna().unique():
             uniq_names.update(oname.split(","))
         all_coords[chrom][start, end].update(uniq_names)
     # Consolidate each region's gene names into a string
@@ -310,7 +310,9 @@ def gene_coords_by_range(probes, chrom, start, end, ignore=params.IGNORE_GENE_NA
     # Tabulate the genes in the selected region
     genes: dict = collections.OrderedDict()
     for row in probes.in_range(chrom, start, end):
-        name = str(row.gene)
+        if not isinstance(row.gene, str):
+            continue
+        name = row.gene
         if name in genes:
             genes[name][1] = row.end
         elif name not in ignore:

diff --git a/cnvlib/reports.py b/cnvlib/reports.py
@@ -62,7 +62,7 @@ def do_breaks(
 
 
 def get_gene_intervals(
-    all_probes: CopyNumArray, ignore: tuple[str, str, str] = params.IGNORE_GENE_NAMES
+    all_probes: CopyNumArray, ignore: tuple[str, ...] = params.IGNORE_GENE_NAMES
 ) -> collections.defaultdict[str, list[tuple[str, list[int], int]]]:
     """Tally genomic locations of each targeted gene.
 
@@ -71,11 +71,13 @@ def get_gene_intervals(
     positions, and end is the last probe's end position as an integer. (The
     endpoints are redundant since probes are adjacent.)
     """
-    ignore += params.ANTITARGET_ALIASES  # type: ignore[assignment]
+    ignore += params.ANTITARGET_ALIASES
     # Tally the start & end points for each targeted gene; group by chromosome
     gene_probes: dict = collections.defaultdict(lambda: collections.defaultdict(list))
     for row in all_probes:
-        gname = str(row.gene)
+        if not isinstance(row.gene, str):
+            continue
+        gname = row.gene
         if gname not in ignore:
             gene_probes[row.chromosome][gname].append(row)
     # Condense into a single interval for each gene
@@ -427,7 +429,7 @@ def group_by_genes(
 
         [(gene, chrom, start, end, [coverages]), ...]
     """
-    ignore = ("", np.nan, *params.ANTITARGET_ALIASES)
+    ignore = ("", *params.ANTITARGET_ALIASES)
     for gene, rows in cnarr.by_gene():
         if not rows or gene in ignore:
             continue

diff --git a/cnvlib/rna.py b/cnvlib/rna.py
@@ -491,7 +491,8 @@ def attach_gene_info_to_cnr(sample_counts, sample_data_log2, gene_info, read_len
     Split out samples to individual .cnr files, keeping (most) gene info.
     """
     gi_cols = ["chromosome", "start", "end", "gene", "gc", "tx_length", "weight"]
-    cnr_info = gene_info.loc[:, gi_cols]
+    cnr_info = gene_info.loc[:, gi_cols].copy()
+    cnr_info["gene"] = cnr_info["gene"].fillna("-")
     # Fill NA fields with the lowest finite value in the same row.
     # Only use NULL_LOG2_COVERAGE if all samples are NA / zero-depth.
     gene_minima = sample_data_log2.min(axis=1, skipna=True)

diff --git a/cnvlib/segfilters.py b/cnvlib/segfilters.py
@@ -7,6 +7,7 @@
 import numpy as np
 import pandas as pd
 
+from skgenome.combiners import join_strings
 from .descriptives import weighted_median
 from typing import TYPE_CHECKING
 
@@ -121,7 +122,7 @@ def _wavg(col: str) -> float:
         "end": cnarr["end"].iat[-1],
     }
     out["log2"] = _wavg("log2")
-    out["gene"] = ",".join(cnarr["gene"].drop_duplicates())
+    out["gene"] = join_strings(cnarr["gene"])
     out["probes"] = cnarr["probes"].sum() if "probes" in cnarr else len(cnarr)
     out["weight"] = region_weight
     if "depth" in cnarr:

diff --git a/cnvlib/segmentation/__init__.py b/cnvlib/segmentation/__init__.py
@@ -10,6 +10,7 @@
 import numpy as np
 import pandas as pd
 from skgenome import tabio
+from skgenome.combiners import join_strings
 from skgenome.intersect import iter_slices
 
 from .. import core, parallel, params, smoothing
@@ -336,7 +337,7 @@ def drop_outliers(cnarr: CNA, width: int, factor: int) -> CNA:
 
 
 def transfer_fields(
-    segments: CNA, cnarr: CNA, ignore: tuple[str, str, str] = params.IGNORE_GENE_NAMES
+    segments: CNA, cnarr: CNA, ignore: tuple[str, ...] = params.IGNORE_GENE_NAMES
 ) -> CNA:
     """Map gene names, weights, depths from `cnarr` bins to `segarr` segments.
 
@@ -387,7 +388,7 @@ def make_null_segment(chrom, orig_start, orig_end):
 
     # Aggregate segment depths, weights, gene names
     # ENH refactor so that np/CNA.data access is encapsulated in skgenome
-    ignore += params.ANTITARGET_ALIASES  # type: ignore[assignment]
+    ignore += params.ANTITARGET_ALIASES
     assert bins_chrom == segments.chromosome.iat[0]
     cdata = cnarr.data.reset_index()
     if "depth" not in cdata.columns:
@@ -410,8 +411,7 @@ def make_null_segment(chrom, orig_start, orig_end):
             bin_count = len(cdata.iloc[bin_idx])
             seg_wt = float(bin_count)
             seg_dp = bin_depths[bin_idx].mean()
-        subgenes = [g for g in pd.unique(bin_genes[bin_idx]) if g not in ignore]
-        seg_gn = ",".join(subgenes) if subgenes else "-"
+        seg_gn = join_strings(bin_genes[bin_idx], ignore=ignore)
         seg_genes[i] = seg_gn
         seg_weights[i] = seg_wt
         seg_depths[i] = seg_dp

diff --git a/skgenome/combiners.py b/skgenome/combiners.py
@@ -56,10 +56,31 @@ def last_of(elems: Sequence) -> Any:
 max_of = max
 
 
-def join_strings(elems: Iterable, sep: str = ",") -> str:
-    """Join a Series of strings by commas."""
-    # ENH if elements are also comma-separated, split+uniq those too
-    return sep.join(pd.unique(elems))
+def join_strings(
+    elems: Iterable,
+    sep: str = ",",
+    ignore: tuple[str, ...] = (),
+) -> str:
+    """Join a Series of unique strings, skipping NaN values and ignored names.
+
+    Parameters
+    ----------
+    elems : iterable
+        Values to join. Non-string elements (e.g. NaN) are silently skipped.
+    sep : str
+        Separator between joined names.
+    ignore : tuple of str
+        String values to exclude from the result (e.g. placeholder gene names).
+
+    Returns
+    -------
+    str
+        The joined string, or ``"-"`` if no valid strings remain.
+    """
+    unique_strs = dict.fromkeys(
+        e for e in elems if isinstance(e, str) and e not in ignore
+    )
+    return sep.join(unique_strs) or "-"
 
 
 def merge_strands(elems: Sequence) -> str:

diff --git a/test/test_cnvlib.py b/test/test_cnvlib.py
@@ -13,7 +13,7 @@
 from skgenome import GenomicArray, tabio
 
 import cnvlib
-from cnvlib import cnary, fix, params, segmetrics
+from cnvlib import cnary, fix, params, segfilters, segmentation, segmetrics
 from conftest import linecount
 
 
@@ -360,6 +360,68 @@ def test_apply_weights_no_nan(self):
         result = fix.apply_weights(cnarr, ref_nan, "log2", "spread")
         self.assertFalse(np.isnan(result["weight"]).any())
 
+    def test_transfer_fields_nan_gene(self):
+        """transfer_fields handles NaN gene names without crashing (issue #900)."""
+        n = 10
+        cnarr = cnary.CopyNumArray(
+            pd.DataFrame(
+                {
+                    "chromosome": ["chr1"] * n,
+                    "start": np.arange(0, n * 1000, 1000),
+                    "end": np.arange(1000, n * 1000 + 1000, 1000),
+                    "gene": ["GeneA", float("nan"), "GeneB"] * 3 + [float("nan")],
+                    "log2": np.zeros(n),
+                    "depth": np.ones(n) * 100.0,
+                    "weight": np.ones(n),
+                }
+            )
+        )
+        # One segment covering all bins
+        segarr = cnary.CopyNumArray(
+            pd.DataFrame(
+                {
+                    "chromosome": ["chr1"],
+                    "start": [0],
+                    "end": [n * 1000],
+                    "gene": ["-"],
+                    "log2": [0.0],
+                    "probes": [n],
+                    "weight": [0.0],
+                }
+            )
+        )
+        result = segmentation.transfer_fields(segarr, cnarr)
+        gene_val = result["gene"].iat[0]
+        self.assertIsInstance(gene_val, str)
+        self.assertNotIn("nan", gene_val.lower())
+        self.assertIn("GeneA", gene_val)
+        self.assertIn("GeneB", gene_val)
+
+    def test_squash_region_nan_gene(self):
+        """squash_region handles NaN gene names without crashing (issue #900)."""
+        df = pd.DataFrame(
+            {
+                "chromosome": ["chr1", "chr1"],
+                "start": [0, 1000],
+                "end": [1000, 2000],
+                "gene": ["GeneA", float("nan")],
+                "log2": [0.1, 0.2],
+                "probes": [5, 5],
+                "weight": [1.0, 1.0],
+            }
+        )
+        result = segfilters.squash_region(df)
+        gene_val = result["gene"].iat[0]
+        self.assertIsInstance(gene_val, str)
+        self.assertNotIn("nan", gene_val.lower())
+        self.assertEqual(gene_val, "GeneA")
+
+        # All-NaN genes should produce the placeholder "-"
+        df_allnan = df.copy()
+        df_allnan["gene"] = [float("nan"), float("nan")]
+        result2 = segfilters.squash_region(df_allnan)
+        self.assertEqual(result2["gene"].iat[0], "-")
+
 
 class VATests(unittest.TestCase):
     """Tests for the VariantArray class."""