Skip to content

Commit 8eb524e

Browse files
committed
[gephi-lite] Improves field type inference
Details: - Adds the possibility in parseFile to specify known field types - Specifies field types for node labels and z values and for edge labels in parseFile's GEXF section - Separates the heuristics for category fields and keyword fields, to improve keyword field detection
1 parent 795958b commit 8eb524e

File tree

4 files changed

+73
-55
lines changed

4 files changed

+73
-55
lines changed

packages/gephi-lite/src/core/file/index.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ export const open = asyncAction(async (file: FileTypeWithoutFormat) => {
6565

6666
try {
6767
// Parse the file
68-
const { data, format } = await parseFile(file);
68+
const { data, metadata, format } = await parseFile(file);
6969

7070
// Do the import
7171
resetStates(false);
@@ -76,7 +76,7 @@ export const open = asyncAction(async (file: FileTypeWithoutFormat) => {
7676
const { mergeState } = appearanceActions;
7777
data.setAttribute("title", file.filename);
7878

79-
const graphDataset = initializeGraphDataset(data);
79+
const graphDataset = initializeGraphDataset(data, metadata);
8080
setGraphDataset(graphDataset);
8181

8282
const appearanceState = inferAppearanceState(graphDataset);

packages/gephi-lite/src/core/file/utils.ts

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { gephiLiteParse } from "@gephi/gephi-lite-sdk";
1+
import { FieldModel, gephiLiteParse } from "@gephi/gephi-lite-sdk";
22
import Graph from "graphology";
33
import gexf from "graphology-gexf/browser";
44
import graphml from "graphology-graphml/browser";
@@ -51,10 +51,13 @@ async function getFileContent(file: FileTypeWithoutFormat): Promise<string> {
5151
/**
5252
* Parse the content of the given file and returns its data and its type.
5353
*/
54-
export async function parseFile(
55-
file: FileTypeWithoutFormat,
56-
): Promise<
57-
{ format: "gexf" | "graphml" | "graphology"; data: Graph } | { format: "gephi-lite"; data: GephiLiteFileFormat }
54+
export async function parseFile(file: FileTypeWithoutFormat): Promise<
55+
| {
56+
format: "gexf" | "graphml" | "graphology";
57+
data: Graph;
58+
metadata?: { nodeFields?: FieldModel<"nodes">[]; edgeFields?: FieldModel<"edges">[] };
59+
}
60+
| { format: "gephi-lite"; data: GephiLiteFileFormat; metadata?: undefined }
5861
> {
5962
const content = await getFileContent(file);
6063
const extension = (file.filename.split(".").pop() || "").toLowerCase();
@@ -65,6 +68,27 @@ export async function parseFile(
6568
return {
6669
format: "gexf",
6770
data: gexf.parse(Graph, content, { allowUndeclaredAttributes: true, addMissingNodes: true }),
71+
metadata: {
72+
nodeFields: [
73+
{
74+
id: "label",
75+
itemType: "nodes",
76+
type: "text",
77+
},
78+
{
79+
id: "z",
80+
itemType: "nodes",
81+
type: "number",
82+
},
83+
],
84+
edgeFields: [
85+
{
86+
id: "label",
87+
itemType: "edges",
88+
type: "text",
89+
},
90+
],
91+
},
6892
};
6993
case "graphml":
7094
return {

packages/gephi-lite/src/core/graph/fieldModel.ts

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ import {
1414
toStringArray,
1515
} from "@gephi/gephi-lite-sdk";
1616
import guessFormat from "@gristlabs/moment-guess";
17-
import { isNumber, sortBy, take, toPairs, uniq } from "lodash";
17+
import { countBy, isNumber, mean, size, sortBy, take, toPairs, uniq } from "lodash";
1818
import { DateTime } from "luxon";
1919

2020
import { isValidColor } from "../../utils/colors";
@@ -38,14 +38,15 @@ export function guessSeparator(values: string[]): string | null {
3838
values.forEach((value) =>
3939
SEPARATORS.forEach((sep) => {
4040
const split = value.split(sep);
41-
if (split.length > 1 && split.every((s) => !!s && !s.match(/(^ | $)/))) separatorsFrequencies[sep]++;
41+
if (split.length > 1 && split.every((s) => !!s.trim())) separatorsFrequencies[sep]++;
4242
}),
4343
);
4444

4545
const bestSeparator = sortBy(
46-
SEPARATORS.filter((sep) => !!separatorsFrequencies[sep]),
46+
SEPARATORS.filter((sep) => separatorsFrequencies[sep] >= values.length / 10),
4747
(sep) => -separatorsFrequencies[sep],
4848
)[0];
49+
4950
return bestSeparator || null;
5051
}
5152

@@ -85,6 +86,7 @@ export function inferFieldType(fieldName: string, values: Scalar[], itemsCount:
8586
values.every((v) => {
8687
try {
8788
const _dateFormat = guessFormat("" + v, "");
89+
8890
// format guesser can return multiple choices, we just pick one
8991
const dateFormat = Array.isArray(_dateFormat) ? _dateFormat[0] : _dateFormat;
9092
const correctedDateFormat = dateFormat.replaceAll("Y", "y").replaceAll("D", "d");
@@ -100,24 +102,27 @@ export function inferFieldType(fieldName: string, values: Scalar[], itemsCount:
100102
return { type: "date", format };
101103
}
102104

103-
// KEYWORDS and CATEGORY
105+
// KEYWORDS
104106
const separator = guessSeparator(
105107
take(
106108
values.map((v) => "" + v),
107109
100,
108110
),
109111
);
110-
const uniqValues = uniq(separator ? values.flatMap((v) => (v + "").split(separator)) : values);
111-
const uniqValuesCount = uniqValues.length;
112+
if (separator) {
113+
const splitValuesCounts = countBy(values.flatMap((v) => (v + "").split(separator)));
114+
const uniqSplitValuesCount = size(splitValuesCounts);
115+
const averageValuesCount = mean(Object.values(splitValuesCounts));
112116

113-
if (
114-
uniqValuesCount > 1 &&
115-
uniqValuesCount < 50 &&
116-
uniqValuesCount < Math.max(separator ? itemsCount : Math.pow(itemsCount, 0.75), 5)
117-
) {
118-
// category and keywords
119-
if (separator) return { type: "keywords", separator };
120-
else return { type: "category" };
117+
if (averageValuesCount > 2 && uniqSplitValuesCount > 1 && uniqSplitValuesCount < itemsCount) {
118+
return { type: "keywords", separator };
119+
}
120+
}
121+
122+
// CATEGORIES
123+
const uniqValuesCount = uniq(values).length;
124+
if (uniqValuesCount > 1 && uniqValuesCount < Math.max(Math.pow(itemsCount, 0.75), 5)) {
125+
return { type: "category" };
121126
}
122127

123128
// TEXT

packages/gephi-lite/src/core/graph/utils.ts

Lines changed: 23 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import { NodeCoordinates, getEmptyGraphDataset, toNumber, toScalar } from "@gephi/gephi-lite-sdk";
22
import Graph, { MultiGraph } from "graphology";
33
import { Attributes } from "graphology-types";
4-
import { flatMap, forEach, isNil, isNumber, keys, mapValues, omit, sortBy, uniq, values } from "lodash";
4+
import { flatMap, forEach, isNil, isNumber, keyBy, keys, mapValues, omit, sortBy, uniq, values } from "lodash";
55

66
import { ItemType, Scalar } from "../types";
77
import { inferFieldType } from "./fieldModel";
@@ -21,7 +21,7 @@ import {
2121
export { datasetToString, getEmptyGraphDataset, parseDataset } from "@gephi/gephi-lite-sdk";
2222

2323
export function getRandomNodeCoordinate(): number {
24-
return Math.random() * 100;
24+
return Math.random() * 1000;
2525
}
2626

2727
export function cleanNode(_node: string, attributes: Attributes): { data: ItemData; position: NodeCoordinates } {
@@ -48,7 +48,10 @@ export function cleanEdge(_edge: string, attributes: Attributes): { data: ItemDa
4848
* This function takes any graphology instance (like returned by any graphology
4949
* importer basically), and returns a properly shaped graph dataset:
5050
*/
51-
export function initializeGraphDataset(graph: Graph): GraphDataset {
51+
export function initializeGraphDataset(
52+
graph: Graph,
53+
{ nodeFields, edgeFields }: { nodeFields?: FieldModel<"nodes">[]; edgeFields?: FieldModel<"edges">[] } = {},
54+
): GraphDataset {
5255
const dataset = getEmptyGraphDataset();
5356

5457
// setting graph meta data
@@ -86,20 +89,26 @@ export function initializeGraphDataset(graph: Graph): GraphDataset {
8689
});
8790

8891
// Infer model:
92+
const nodeFieldsDict = keyBy(nodeFields, "id");
93+
const edgeFieldsDict = keyBy(edgeFields, "id");
94+
8995
forEach(nodeAttributeValues, (values, key) => {
90-
dataset.nodeFields.push({
91-
id: key,
92-
itemType: "nodes",
93-
...inferFieldType(key, values, graph.order),
94-
});
96+
dataset.nodeFields.push(
97+
nodeFieldsDict[key] || {
98+
id: key,
99+
itemType: "nodes",
100+
...inferFieldType(key, values, graph.order),
101+
},
102+
);
95103
});
96-
97104
forEach(edgeAttributeValues, (values, key) => {
98-
dataset.edgeFields.push({
99-
id: key,
100-
itemType: "edges",
101-
...inferFieldType(key, values, graph.size),
102-
});
105+
dataset.edgeFields.push(
106+
edgeFieldsDict[key] || {
107+
id: key,
108+
itemType: "edges",
109+
...inferFieldType(key, values, graph.size),
110+
},
111+
);
103112
});
104113

105114
const labelsOrder = ["label", "size", "weight", "color"];
@@ -253,23 +262,3 @@ export function uniqFieldValuesAsStrings(items: Record<string, ItemData>, field:
253262
}),
254263
) as string[];
255264
}
256-
257-
// /**
258-
// * Generate the original graph from the graphDataset.
259-
// */
260-
// export function graphDatasetToGraphData(graphDataset: GraphDataset): DataGraph {
261-
// const graph = graphDataset.fullGraph.copy();
262-
// Object.entries(graphDataset.nodeData).map(([key, value]) => {
263-
// graph.updateNodeAttributes(key, (attrs) => ({ ...attrs, ...value }));
264-
// });
265-
// Object.entries(graphDataset.edgeData).map(([key, value]) => {
266-
// graph.updateEdgeAttributes(key, (attrs) => ({ ...attrs, ...value }));
267-
// });
268-
// Object.entries(graphDataset.nodeRenderingData).map(([key, value]) => {
269-
// graph.updateNodeAttributes(key, (attrs) => ({ ...attrs, ...value }));
270-
// });
271-
// Object.entries(graphDataset.edgeRenderingData).map(([key, value]) => {
272-
// graph.updateEdgeAttributes(key, (attrs) => ({ ...attrs, ...value }));
273-
// });
274-
// return graph;
275-
// }

0 commit comments

Comments
 (0)