Skip to content

Commit feeddbb

Browse files
authored
Improve logging (#235)
* WIP: improve logging * Remove unused code * Cut long string, configure via env vars, restructure utils folder * ruff * Fix tests * Update changelog * There is no reason not to test that * Rename * Add tests * Update log message * Ruff * Ruff again * Spelling
1 parent 39a4b73 commit feeddbb

File tree

14 files changed

+245
-26
lines changed

14 files changed

+245
-26
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
### Fixed
1515
- IDs for the Document and Chunk nodes in the lexical graph are now randomly generated and unique across multiple runs, fixing issues in the lexical graph where relationships were created between chunks that were created by different pipeline runs.
16-
16+
- Improve logging for a better debugging experience: long lists and strings are now truncated. The max length can be controlled using the `LOGGING__MAX_LIST_LENGTH` and `LOGGING__MAX_STRING_LENGTH` env variables.
1717

1818
## 1.3.0
1919

examples/build_graph/simple_kg_builder_from_text.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
"""
99

1010
import asyncio
11+
import logging
1112

1213
import neo4j
1314
from neo4j_graphrag.embeddings import OpenAIEmbeddings
@@ -20,6 +21,11 @@
2021
from neo4j_graphrag.llm import LLMInterface
2122
from neo4j_graphrag.llm.openai_llm import OpenAILLM
2223

24+
logging.basicConfig()
25+
logging.getLogger("neo4j_graphrag").setLevel(logging.DEBUG)
26+
# logging.getLogger("neo4j_graphrag").setLevel(logging.INFO)
27+
28+
2329
# Neo4j db infos
2430
URI = "neo4j://localhost:7687"
2531
AUTH = ("neo4j", "password")

src/neo4j_graphrag/experimental/components/entity_relation_extractor.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
from neo4j_graphrag.experimental.pipeline.exceptions import InvalidJSONError
3939
from neo4j_graphrag.generation.prompts import ERExtractionTemplate, PromptTemplate
4040
from neo4j_graphrag.llm import LLMInterface
41+
from neo4j_graphrag.utils.logging import prettify
4142

4243
logger = logging.getLogger(__name__)
4344

@@ -216,25 +217,23 @@ async def extract_for_chunk(
216217
result = json.loads(llm_generated_json)
217218
except (json.JSONDecodeError, InvalidJSONError) as e:
218219
if self.on_error == OnError.RAISE:
219-
raise LLMGenerationError(
220-
f"LLM response is not valid JSON {llm_result.content}: {e}"
221-
)
220+
raise LLMGenerationError("LLM response is not valid JSON") from e
222221
else:
223222
logger.error(
224-
f"LLM response is not valid JSON {llm_result.content} for chunk_index={chunk.index}"
223+
f"LLM response is not valid JSON for chunk_index={chunk.index}"
225224
)
225+
logger.debug(f"Invalid JSON: {llm_result.content}")
226226
result = {"nodes": [], "relationships": []}
227227
try:
228228
chunk_graph = Neo4jGraph(**result)
229229
except ValidationError as e:
230230
if self.on_error == OnError.RAISE:
231-
raise LLMGenerationError(
232-
f"LLM response has improper format {result}: {e}"
233-
)
231+
raise LLMGenerationError("LLM response has improper format") from e
234232
else:
235233
logger.error(
236-
f"LLM response has improper format {result} for chunk_index={chunk.index}"
234+
f"LLM response has improper format for chunk_index={chunk.index}"
237235
)
236+
logger.debug(f"Invalid JSON format: {result}")
238237
chunk_graph = Neo4jGraph()
239238
return chunk_graph
240239

@@ -336,5 +335,5 @@ async def run(
336335
]
337336
chunk_graphs: list[Neo4jGraph] = list(await asyncio.gather(*tasks))
338337
graph = self.combine_chunk_graphs(lexical_graph, chunk_graphs)
339-
logger.debug(f"{self.__class__.__name__}: {graph}")
338+
logger.debug(f"Extracted graph: {prettify(graph)}")
340339
return graph

src/neo4j_graphrag/experimental/pipeline/config/runner.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
from neo4j_graphrag.experimental.pipeline.config.types import PipelineType
4949
from neo4j_graphrag.experimental.pipeline.pipeline import PipelineResult
5050
from neo4j_graphrag.experimental.pipeline.types import PipelineDefinition
51+
from neo4j_graphrag.utils.logging import prettify
5152

5253
logger = logging.getLogger(__name__)
5354

@@ -70,6 +71,7 @@ class PipelineConfigWrapper(BaseModel):
7071
] = Field(discriminator=Discriminator(_get_discriminator_value))
7172

7273
def parse(self, resolved_data: dict[str, Any] | None = None) -> PipelineDefinition:
74+
logger.debug("PIPELINE_CONFIG: start parsing config...")
7375
return self.config.parse(resolved_data)
7476

7577
def get_run_params(self, user_input: dict[str, Any]) -> dict[str, Any]:
@@ -101,10 +103,14 @@ def from_config(
101103
cls, config: AbstractPipelineConfig | dict[str, Any], do_cleaning: bool = False
102104
) -> Self:
103105
wrapper = PipelineConfigWrapper.model_validate({"config": config})
106+
logger.debug(
107+
f"PIPELINE_RUNNER: instantiating Pipeline from config type: {wrapper.config.template_}"
108+
)
104109
return cls(wrapper.parse(), config=wrapper.config, do_cleaning=do_cleaning)
105110

106111
@classmethod
107112
def from_config_file(cls, file_path: Union[str, Path]) -> Self:
113+
logger.info(f"PIPELINE_RUNNER: reading config file from {file_path}")
108114
if not isinstance(file_path, str):
109115
file_path = str(file_path)
110116
data = ConfigReader().read(file_path)
@@ -119,7 +125,7 @@ async def run(self, user_input: dict[str, Any]) -> PipelineResult:
119125
else:
120126
run_param = deep_update(self.run_params, user_input)
121127
logger.info(
122-
f"PIPELINE_RUNNER: starting pipeline {self.pipeline} with run_params={run_param}"
128+
f"PIPELINE_RUNNER: starting pipeline {self.pipeline} with run_params={prettify(run_param)}"
123129
)
124130
result = await self.pipeline.run(data=run_param)
125131
if self.do_cleaning:

src/neo4j_graphrag/experimental/pipeline/pipeline.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
from timeit import default_timer
2525
from typing import Any, AsyncGenerator, Optional
2626

27+
from neo4j_graphrag.utils.logging import prettify
28+
2729
try:
2830
import pygraphviz as pgv
2931
except ImportError:
@@ -90,21 +92,21 @@ async def execute(self, **kwargs: Any) -> RunResult | None:
9092
if the task run successfully, None if the status update
9193
was unsuccessful.
9294
"""
93-
logger.debug(f"Running component {self.name} with {kwargs}")
94-
start_time = default_timer()
9595
component_result = await self.component.run(**kwargs)
9696
run_result = RunResult(
9797
result=component_result,
9898
)
99-
end_time = default_timer()
100-
logger.debug(f"Component {self.name} finished in {end_time - start_time}s")
10199
return run_result
102100

103101
async def run(self, inputs: dict[str, Any]) -> RunResult | None:
104102
"""Main method to execute the task."""
105-
logger.debug(f"TASK START {self.name=} {inputs=}")
103+
logger.debug(f"TASK START {self.name=} input={prettify(inputs)}")
104+
start_time = default_timer()
106105
res = await self.execute(**inputs)
107-
logger.debug(f"TASK RESULT {self.name=} {res=}")
106+
end_time = default_timer()
107+
logger.debug(
108+
f"TASK FINISHED {self.name} in {end_time - start_time} res={prettify(res)}"
109+
)
108110
return res
109111

110112

@@ -141,7 +143,9 @@ async def run_task(self, task: TaskPipelineNode, data: dict[str, Any]) -> None:
141143
try:
142144
await self.set_task_status(task.name, RunStatus.RUNNING)
143145
except PipelineStatusUpdateError:
144-
logger.info(f"Component {task.name} already running or done")
146+
logger.debug(
147+
f"ORCHESTRATOR: TASK ABORTED: {task.name} is already running or done, aborting"
148+
)
145149
return None
146150
res = await task.run(inputs)
147151
await self.set_task_status(task.name, RunStatus.DONE)
@@ -198,7 +202,8 @@ async def check_dependencies_complete(self, task: TaskPipelineNode) -> None:
198202
d_status = await self.get_status_for_component(d.start)
199203
if d_status != RunStatus.DONE:
200204
logger.debug(
201-
f"Missing dependency {d.start} for {task.name} (status: {d_status}). "
205+
f"ORCHESTRATOR {self.run_id}: TASK DELAYED: Missing dependency {d.start} for {task.name} "
206+
f"(status: {d_status}). "
202207
"Will try again when dependency is complete."
203208
)
204209
raise PipelineMissingDependencyError()
@@ -227,6 +232,9 @@ async def next(
227232
await self.check_dependencies_complete(next_node)
228233
except PipelineMissingDependencyError:
229234
continue
235+
logger.debug(
236+
f"ORCHESTRATOR {self.run_id}: enqueuing next task: {next_node.name}"
237+
)
230238
yield next_node
231239
return
232240

@@ -315,7 +323,6 @@ async def run(self, data: dict[str, Any]) -> None:
315323
(node without any parent). Then the callback on_task_complete
316324
will handle the task dependencies.
317325
"""
318-
logger.debug(f"PIPELINE START {data=}")
319326
tasks = [self.run_task(root, data) for root in self.pipeline.roots()]
320327
await asyncio.gather(*tasks)
321328

@@ -624,15 +631,16 @@ def validate_parameter_mapping_for_task(self, task: TaskPipelineNode) -> bool:
624631
return True
625632

626633
async def run(self, data: dict[str, Any]) -> PipelineResult:
627-
logger.debug("Starting pipeline")
634+
logger.debug("PIPELINE START")
628635
start_time = default_timer()
629636
self.invalidate()
630637
self.validate_input_data(data)
631638
orchestrator = Orchestrator(self)
639+
logger.debug(f"PIPELINE ORCHESTRATOR: {orchestrator.run_id}")
632640
await orchestrator.run(data)
633641
end_time = default_timer()
634642
logger.debug(
635-
f"Pipeline {orchestrator.run_id} finished in {end_time - start_time}s"
643+
f"PIPELINE FINISHED {orchestrator.run_id} in {end_time - start_time}s"
636644
)
637645
return PipelineResult(
638646
run_id=orchestrator.run_id,

src/neo4j_graphrag/types.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
model_validator,
2727
)
2828

29-
from neo4j_graphrag.utils import validate_search_query_input
29+
from neo4j_graphrag.utils.validation import validate_search_query_input
3030

3131

3232
class RawSearchResult(BaseModel):

src/neo4j_graphrag/utils/__init__.py

Whitespace-only changes.

src/neo4j_graphrag/utils/logging.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Copyright (c) "Neo4j"
2+
# Neo4j Sweden AB [https://neo4j.com]
3+
# #
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
# #
8+
# https://www.apache.org/licenses/LICENSE-2.0
9+
# #
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
from __future__ import annotations
16+
17+
import os
18+
from typing import Any
19+
20+
from pydantic import BaseModel
21+
22+
DEFAULT_MAX_LIST_LENGTH: int = 5
23+
DEFAULT_MAX_STRING_LENGTH: int = 200
24+
25+
26+
class Prettifier:
    """Prettify any object for logging.

    I.e.: truncate long lists and strings, even nested.

    Max list and string length can be configured using env variables:
    - LOGGING__MAX_LIST_LENGTH (int)
    - LOGGING__MAX_STRING_LENGTH (int)

    A value that is not a non-negative integer is ignored (a warning is
    emitted) and the corresponding default is used instead, so that a
    misconfigured logging variable can never prevent this module from
    being imported.
    """

    def __init__(self) -> None:
        self.max_list_length = self._parse_limit(
            "LOGGING__MAX_LIST_LENGTH",
            os.environ.get("LOGGING__MAX_LIST_LENGTH"),
            DEFAULT_MAX_LIST_LENGTH,
        )
        self.max_string_length = self._parse_limit(
            "LOGGING__MAX_STRING_LENGTH",
            os.environ.get("LOGGING__MAX_STRING_LENGTH"),
            DEFAULT_MAX_STRING_LENGTH,
        )

    @staticmethod
    def _parse_limit(name: str, raw: str | None, default: int) -> int:
        """Convert the raw env value ``raw`` of variable ``name`` to a limit.

        Returns ``default`` when the variable is unset, is not an integer,
        or is negative (a negative bound would turn the ``[:limit]`` slices
        below into "drop the last items", which is never what is wanted).
        """
        import warnings

        if raw is None:
            return default
        try:
            limit = int(raw)
        except (TypeError, ValueError):
            limit = -1  # handled by the same fallback as negative values
        if limit < 0:
            warnings.warn(
                f"Ignoring invalid value {raw!r} for {name}: expected a "
                f"non-negative integer, using default {default}",
                RuntimeWarning,
                stacklevel=2,
            )
            return default
        return limit

    def _prettify_dict(self, value: dict[Any, Any]) -> dict[Any, Any]:
        """Return a copy of ``value`` with every value prettified.

        Keys are kept untouched.
        """
        return {
            k: self(v)  # prettify each value
            for k, v in value.items()
        }

    def _prettify_list(self, value: list[Any]) -> list[Any]:
        """Return at most ``max_list_length`` prettified items.

        When items were dropped, a trailing marker string reports how many.
        """
        items = [
            self(v)  # prettify each item
            for v in value[: self.max_list_length]
        ]
        remaining_items = len(value) - len(items)
        if remaining_items > 0:
            items.append(f"... ({remaining_items} items)")
        return items

    def _prettify_str(self, value: str) -> str:
        """Return ``value`` cut to ``max_string_length`` characters.

        When characters were dropped, a suffix reports how many.
        """
        new_value = value[: self.max_string_length]
        remaining_chars = len(value) - len(new_value)
        if remaining_chars > 0:
            new_value += f"... ({remaining_chars} chars)"
        return new_value

    def __call__(self, value: Any) -> Any:
        """Takes any value and returns a prettified version for logging.

        Dicts, pydantic models (dumped to a dict first), lists and strings
        are processed recursively; any other value is returned unchanged.
        """
        if isinstance(value, dict):
            return self._prettify_dict(value)
        if isinstance(value, BaseModel):
            return self(value.model_dump())
        if isinstance(value, list):
            return self._prettify_list(value)
        if isinstance(value, str):
            return self._prettify_str(value)
        return value
78+
79+
80+
prettify = Prettifier()
File renamed without changes.

tests/e2e/test_kg_writer_component_e2e.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ async def test_kg_writer(driver: neo4j.Driver) -> None:
7676
if start_node.embedding_properties: # for mypy
7777
for key, val in start_node.embedding_properties.items():
7878
assert key in node_a.keys()
79-
assert node_a.get(key) == [1.0, 2.0, 3.0]
79+
assert val == node_a.get(key)
8080

8181
node_b = record["b"]
8282
assert end_node.label in list(node_b.labels)

tests/e2e/test_simplekgpipeline_e2e.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
import neo4j
2121
import pytest
2222
from neo4j import Driver
23-
2423
from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import (
2524
FixedSizeSplitter,
2625
)

tests/unit/experimental/components/test_embedder.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,10 @@
1616

1717
import pytest
1818
from neo4j_graphrag.experimental.components.embedder import TextChunkEmbedder
19-
from neo4j_graphrag.experimental.components.types import TextChunk, TextChunks
19+
from neo4j_graphrag.experimental.components.types import (
20+
TextChunk,
21+
TextChunks,
22+
)
2023

2124

2225
@pytest.mark.asyncio

tests/unit/utils/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)