
Commit a6317b3

Fix allocation of non-transient strings in StringStore (#13713)
* Fix a bug in the memory-zone code when adding non-transient strings. The error could cause segmentation faults or other memory errors inside memory zones if new labels were added to the model.
* Fix handling of new morphological labels within memory zones. Addresses the second issue reported in "Memory leak of MorphAnalysis object" (#13684).
1 parent 3e30b5b commit a6317b3
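
A minimal sketch of the failure mode fixed here, assuming spaCy v3.8 with the Language.memory_zone() API; the text and feature value are illustrative:

import spacy

nlp = spacy.blank("en")
with nlp.memory_zone():
    doc = nlp("I saw cats")
    # Assigning a morph feature the model has never seen interns new
    # strings ("Case", "Case=Nom"). The Morphology table caches the
    # resulting tag beyond the zone, so those strings must not live in
    # the zone's transient pool.
    doc[0].set_morph("Case=Nom")
# On zone exit the transient strings are freed; before this fix the cached
# morph tag could still point at that freed memory, producing the reported
# segfaults and other memory errors.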

File tree

6 files changed: +64 -53 lines changed


.github/workflows/tests.yml

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ jobs:
       fail-fast: true
       matrix:
         os: [ubuntu-latest, windows-latest, macos-latest]
-        python_version: ["3.9", "3.11", "3.12"]
+        python_version: ["3.9", "3.12"]
 
     runs-on: ${{ matrix.os }}
 

setup.cfg

Lines changed: 4 additions & 3 deletions
@@ -21,6 +21,7 @@ classifiers =
     Programming Language :: Python :: 3.10
     Programming Language :: Python :: 3.11
     Programming Language :: Python :: 3.12
+    Programming Language :: Python :: 3.13
     Topic :: Scientific/Engineering
 project_urls =
     Release notes = https://github.com/explosion/spaCy/releases
@@ -29,13 +30,13 @@ project_urls =
 [options]
 zip_safe = false
 include_package_data = true
-python_requires = >=3.9
+python_requires = >=3.9,<3.13
 # NOTE: This section is superseded by pyproject.toml and will be removed in
 # spaCy v4
 setup_requires =
     cython>=0.25,<3.0
-    numpy>=2.0.0,<2.1.0; python_version < "3.9"
-    numpy>=2.0.0,<2.1.0; python_version >= "3.9"
+    numpy>=2.0.0,<3.0.0; python_version < "3.9"
+    numpy>=2.0.0,<3.0.0; python_version >= "3.9"
     # We also need our Cython packages here to compile against
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0

spacy/about.py

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.8.2"
+__version__ = "3.8.3"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

spacy/morphology.pyx

Lines changed: 7 additions & 3 deletions
@@ -57,16 +57,20 @@ cdef class Morphology:
         field_feature_pairs = []
         for field in sorted(string_features):
             values = string_features[field]
+            self.strings.add(field, allow_transient=False),
+            field_id = self.strings[field]
             for value in values.split(self.VALUE_SEP):
+                field_sep_value = field + self.FIELD_SEP + value
+                self.strings.add(field_sep_value, allow_transient=False),
                 field_feature_pairs.append((
-                    self.strings.add(field),
-                    self.strings.add(field + self.FIELD_SEP + value),
+                    field_id,
+                    self.strings[field_sep_value]
                 ))
         cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs)
         # the hash key for the tag is either the hash of the normalized UFEATS
         # string or the hash of an empty placeholder
         norm_feats_string = self.normalize_features(features)
-        tag.key = self.strings.add(norm_feats_string)
+        tag.key = self.strings.add(norm_feats_string, allow_transient=False)
         self.insert(tag)
         return tag.key
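
For reference, a pure-Python sketch of the (field, field=value) pairs this loop interns, assuming spaCy's morphology separators (FIELD_SEP "=" and VALUE_SEP ","); it mirrors the logic only, not the Cython internals:

FIELD_SEP = "="
VALUE_SEP = ","

def field_feature_pairs(string_features: dict) -> list:
    # Each feature string such as "Number=Plur,Sing" expands into one
    # (field, field=value) pair per value; the patched code above interns
    # both members non-transiently before looking up their hashes.
    pairs = []
    for field in sorted(string_features):
        for value in string_features[field].split(VALUE_SEP):
            pairs.append((field, field + FIELD_SEP + value))
    return pairs

print(field_feature_pairs({"Case": "Nom", "Number": "Plur,Sing"}))
# [('Case', 'Case=Nom'), ('Number', 'Number=Plur'), ('Number', 'Number=Sing')]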

spacy/strings.pyx

Lines changed: 6 additions & 1 deletion
@@ -222,6 +222,8 @@ cdef class StringStore:
         internally should not.
         RETURNS (uint64): The string's hash value.
         """
+        if not string:
+            return 0
         if allow_transient is None:
             allow_transient = self.mem is not self._non_temp_mem
         cdef hash_t str_hash
@@ -383,7 +385,10 @@
         cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
         if value is not NULL:
             return value
-        value = _allocate(self.mem, <unsigned char*>utf8_string, length)
+        if allow_transient:
+            value = _allocate(self.mem, <unsigned char*>utf8_string, length)
+        else:
+            value = _allocate(self._non_temp_mem, <unsigned char*>utf8_string, length)
         self._map.set(key, value)
         if allow_transient and self.mem is not self._non_temp_mem:
             self._transient_keys.push_back(key)
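
A toy Python model of the allocation rule this hunk enforces: transient strings go to the active zone's pool and are dropped on zone exit, while allow_transient=False strings always land in the permanent pool. Class and attribute names here are illustrative stand-ins, not the Cython internals:

class ToyStringStore:
    def __init__(self):
        self._permanent = {}   # stands in for _non_temp_mem
        self._transient = {}   # stands in for the zone-scoped mem
        self._in_zone = False

    def add(self, string, allow_transient=None):
        if not string:
            return 0
        if allow_transient is None:
            allow_transient = self._in_zone
        key = hash(string)
        if key in self._permanent or key in self._transient:
            return key
        # The fix: pick the pool from allow_transient rather than always
        # allocating from whichever memory happens to be active.
        if allow_transient and self._in_zone:
            self._transient[key] = string
        else:
            self._permanent[key] = string
        return key

    def end_zone(self):
        # Mirrors zone exit: transient strings are freed in bulk, while
        # permanent strings survive.
        self._transient.clear()
        self._in_zone = False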

spacy/tests/training/test_pretraining.py.disabled

Lines changed: 45 additions & 44 deletions
@@ -264,50 +264,51 @@ def test_pretraining_tagger():
     pretrain(filled, tmp_dir)
 
 
-def test_pretraining_training():
-    """Test that training can use a pretrained Tok2Vec model"""
-    config = Config().from_str(pretrain_string_internal)
-    nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
-    filled = nlp.config
-    pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
-    filled = pretrain_config.merge(filled)
-    train_config = util.load_config(DEFAULT_CONFIG_PATH)
-    filled = train_config.merge(filled)
-    with make_tempdir() as tmp_dir:
-        pretrain_dir = tmp_dir / "pretrain"
-        pretrain_dir.mkdir()
-        file_path = write_sample_jsonl(pretrain_dir)
-        filled["paths"]["raw_text"] = file_path
-        filled["pretraining"]["component"] = "tagger"
-        filled["pretraining"]["layer"] = "tok2vec"
-        train_dir = tmp_dir / "train"
-        train_dir.mkdir()
-        train_path, dev_path = write_sample_training(train_dir)
-        filled["paths"]["train"] = train_path
-        filled["paths"]["dev"] = dev_path
-        filled = filled.interpolate()
-        P = filled["pretraining"]
-        nlp_base = init_nlp(filled)
-        model_base = (
-            nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
-        )
-        embed_base = None
-        for node in model_base.walk():
-            if node.name == "hashembed":
-                embed_base = node
-        pretrain(filled, pretrain_dir)
-        pretrained_model = Path(pretrain_dir / "model3.bin")
-        assert pretrained_model.exists()
-        filled["initialize"]["init_tok2vec"] = str(pretrained_model)
-        nlp = init_nlp(filled)
-        model = nlp.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
-        embed = None
-        for node in model.walk():
-            if node.name == "hashembed":
-                embed = node
-        # ensure that the tok2vec weights are actually changed by the pretraining
-        assert np.any(np.not_equal(embed.get_param("E"), embed_base.get_param("E")))
-        train(nlp, train_dir)
+# Try to debug segfault on windows
+#def test_pretraining_training():
+#    """Test that training can use a pretrained Tok2Vec model"""
+#    config = Config().from_str(pretrain_string_internal)
+#    nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
+#    filled = nlp.config
+#    pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
+#    filled = pretrain_config.merge(filled)
+#    train_config = util.load_config(DEFAULT_CONFIG_PATH)
+#    filled = train_config.merge(filled)
+#    with make_tempdir() as tmp_dir:
+#        pretrain_dir = tmp_dir / "pretrain"
+#        pretrain_dir.mkdir()
+#        file_path = write_sample_jsonl(pretrain_dir)
+#        filled["paths"]["raw_text"] = file_path
+#        filled["pretraining"]["component"] = "tagger"
+#        filled["pretraining"]["layer"] = "tok2vec"
+#        train_dir = tmp_dir / "train"
+#        train_dir.mkdir()
+#        train_path, dev_path = write_sample_training(train_dir)
+#        filled["paths"]["train"] = train_path
+#        filled["paths"]["dev"] = dev_path
+#        filled = filled.interpolate()
+#        P = filled["pretraining"]
+#        nlp_base = init_nlp(filled)
+#        model_base = (
+#            nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
+#        )
+#        embed_base = None
+#        for node in model_base.walk():
+#            if node.name == "hashembed":
+#                embed_base = node
+#        pretrain(filled, pretrain_dir)
+#        pretrained_model = Path(pretrain_dir / "model3.bin")
+#        assert pretrained_model.exists()
+#        filled["initialize"]["init_tok2vec"] = str(pretrained_model)
+#        nlp = init_nlp(filled)
+#        model = nlp.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
+#        embed = None
+#        for node in model.walk():
+#            if node.name == "hashembed":
+#                embed = node
+#        # ensure that the tok2vec weights are actually changed by the pretraining
+#        assert np.any(np.not_equal(embed.get_param("E"), embed_base.get_param("E")))
+#        train(nlp, train_dir)
 
 
 def write_sample_jsonl(tmp_dir):
