From 2d226711799d5ec7bf707369508ae2fc9c360c8d Mon Sep 17 00:00:00 2001 From: Luigi Foscari Date: Fri, 17 Feb 2023 19:23:34 +0100 Subject: [PATCH 01/85] extracted ArcLabelledBatchGraph from transposeOffline --- src/it/unimi/dsi/webgraph/Transform.java | 398 ++++++++++++----------- 1 file changed, 207 insertions(+), 191 deletions(-) diff --git a/src/it/unimi/dsi/webgraph/Transform.java b/src/it/unimi/dsi/webgraph/Transform.java index 900c576..e172925 100644 --- a/src/it/unimi/dsi/webgraph/Transform.java +++ b/src/it/unimi/dsi/webgraph/Transform.java @@ -1725,236 +1725,252 @@ public static ArcLabelledImmutableGraph transposeOffline(final ArcLabelledImmuta final long numArcs = m; // Now we return an immutable graph whose nodeIterator() merges the batches on the fly. - return new ArcLabelledImmutableSequentialGraph() { - @Override - public int numNodes() { return n; } - @Override - public long numArcs() { return numArcs; } - @Override - public boolean hasCopiableIterators() { return true; } + return new ArcLabelledBatchGraph(n, numArcs, batches, labelBatches, prototype); + } - class InternalArcLabelledNodeIterator extends ArcLabelledNodeIterator { - /** The buffer size. We can't make it too big—there's two per batch, per thread. */ - private static final int STD_BUFFER_SIZE = 64 * 1024; - private final int[] refArray; - private final InputBitStream[] batchIbs; - private final InputBitStream[] labelInputBitStream; - private final int[] inputStreamLength; - private final int[] prevTarget; - - // The indirect queue used to merge the batches. - private final IntHeapSemiIndirectPriorityQueue queue; - /** The limit for {@link #hasNext()}. */ - private final int hasNextLimit; - - /** The last returned node (-1 if no node has been returned yet). */ - private int last; - /** The outdegree of the current node (valid if {@link #last} is not -1). 
*/ - private int outdegree; - /** The successors of the current node (valid if {@link #last} is not -1); - * only the first {@link #outdegree} entries are meaningful. */ - private int[] successor; - /** The labels of the arcs going out of the current node (valid if {@link #last} is not -1); - * only the first {@link #outdegree} entries are meaningful. */ - private Label[] label; - - public InternalArcLabelledNodeIterator(final int upperBound) throws IOException { - this(upperBound, null, null, null, null, null, -1, 0, IntArrays.EMPTY_ARRAY, Label.EMPTY_LABEL_ARRAY); + public static class ArcLabelledBatchGraph extends ArcLabelledImmutableSequentialGraph { + private int n; + private long numArcs; + private ObjectArrayList batches; + private ObjectArrayList labelBatches; + private Label prototype; + + public ArcLabelledBatchGraph(int n, long numArcs, ObjectArrayList batches, ObjectArrayList labelBatches, Label prototype) { + this.n = n; + this.numArcs = numArcs; + this.batches = batches; + this.labelBatches = labelBatches; + this.prototype = prototype; + } + + @Override + public int numNodes() { return n; } + @Override + public long numArcs() { return numArcs; } + @Override + public boolean hasCopiableIterators() { return true; } + + class InternalArcLabelledNodeIterator extends ArcLabelledNodeIterator { + /** The buffer size. We can't make it too big—there's two per batch, per thread. */ + private static final int STD_BUFFER_SIZE = 64 * 1024; + private final int[] refArray; + private final InputBitStream[] batchIbs; + private final InputBitStream[] labelInputBitStream; + private final int[] inputStreamLength; + private final int[] prevTarget; + + // The indirect queue used to merge the batches. + private final IntHeapSemiIndirectPriorityQueue queue; + /** The limit for {@link #hasNext()}. */ + private final int hasNextLimit; + + /** The last returned node (-1 if no node has been returned yet). 
*/ + private int last; + /** The outdegree of the current node (valid if {@link #last} is not -1). */ + private int outdegree; + /** The successors of the current node (valid if {@link #last} is not -1); + * only the first {@link #outdegree} entries are meaningful. */ + private int[] successor; + /** The labels of the arcs going out of the current node (valid if {@link #last} is not -1); + * only the first {@link #outdegree} entries are meaningful. */ + private Label[] label; + + public InternalArcLabelledNodeIterator(final int upperBound) throws IOException { + this(upperBound, null, null, null, null, null, -1, 0, IntArrays.EMPTY_ARRAY, Label.EMPTY_LABEL_ARRAY); + } + + public InternalArcLabelledNodeIterator(final int upperBound, final InputBitStream[] baseIbs, final InputBitStream[] baseLabelInputBitStream, final int[] refArray, final int[] prevTarget, final int[] inputStreamLength, final int last, final int outdegree, final int successor[], final Label[] label) throws IOException { + this.hasNextLimit = Math.min(n, upperBound) - 1; + this.last = last; + this.outdegree = outdegree; + this.successor = successor; + this.label = label; + batchIbs = new InputBitStream[batches.size()]; + labelInputBitStream = new InputBitStream[batches.size()]; + + if (refArray == null) { + this.refArray = new int[batches.size()]; + this.prevTarget = new int[batches.size()]; + this.inputStreamLength = new int[batches.size()]; + Arrays.fill(this.prevTarget, -1); + queue = new IntHeapSemiIndirectPriorityQueue(this.refArray); + // We open all files and load the first element into the reference array. 
+ for(int i = 0; i < batches.size(); i++) { + batchIbs[i] = new InputBitStream(batches.get(i), STD_BUFFER_SIZE); + labelInputBitStream[i] = new InputBitStream(labelBatches.get(i), STD_BUFFER_SIZE); + this.inputStreamLength[i] = batchIbs[i].readDelta(); + this.refArray[i] = batchIbs[i].readDelta(); + queue.enqueue(i); + } } + else { + this.refArray = refArray; + this.prevTarget = prevTarget; + this.inputStreamLength = inputStreamLength; + queue = new IntHeapSemiIndirectPriorityQueue(refArray); - public InternalArcLabelledNodeIterator(final int upperBound, final InputBitStream[] baseIbs, final InputBitStream[] baseLabelInputBitStream, final int[] refArray, final int[] prevTarget, final int[] inputStreamLength, final int last, final int outdegree, final int successor[], final Label[] label) throws IOException { - this.hasNextLimit = Math.min(n, upperBound) - 1; - this.last = last; - this.outdegree = outdegree; - this.successor = successor; - this.label = label; - batchIbs = new InputBitStream[batches.size()]; - labelInputBitStream = new InputBitStream[batches.size()]; - - if (refArray == null) { - this.refArray = new int[batches.size()]; - this.prevTarget = new int[batches.size()]; - this.inputStreamLength = new int[batches.size()]; - Arrays.fill(this.prevTarget, -1); - queue = new IntHeapSemiIndirectPriorityQueue(this.refArray); - // We open all files and load the first element into the reference array. 
- for(int i = 0; i < batches.size(); i++) { + for(int i = 0; i < refArray.length; i++) { + if (baseIbs[i] != null) { batchIbs[i] = new InputBitStream(batches.get(i), STD_BUFFER_SIZE); + batchIbs[i].position(baseIbs[i].position()); labelInputBitStream[i] = new InputBitStream(labelBatches.get(i), STD_BUFFER_SIZE); - this.inputStreamLength[i] = batchIbs[i].readDelta(); - this.refArray[i] = batchIbs[i].readDelta(); + labelInputBitStream[i].position(baseLabelInputBitStream[i].position()); queue.enqueue(i); } } - else { - this.refArray = refArray; - this.prevTarget = prevTarget; - this.inputStreamLength = inputStreamLength; - queue = new IntHeapSemiIndirectPriorityQueue(refArray); - - for(int i = 0; i < refArray.length; i++) { - if (baseIbs[i] != null) { - batchIbs[i] = new InputBitStream(batches.get(i), STD_BUFFER_SIZE); - batchIbs[i].position(baseIbs[i].position()); - labelInputBitStream[i] = new InputBitStream(labelBatches.get(i), STD_BUFFER_SIZE); - labelInputBitStream[i].position(baseLabelInputBitStream[i].position()); - queue.enqueue(i); - } - } - } } + } - @Override - public int outdegree() { - if (last == -1) throw new IllegalStateException(); - return outdegree; - } + @Override + public int outdegree() { + if (last == -1) throw new IllegalStateException(); + return outdegree; + } - @Override - public boolean hasNext() { - return last < hasNextLimit; - } + @Override + public boolean hasNext() { + return last < hasNextLimit; + } - @Override - public int nextInt() { - last++; - int d = 0; - int i; - - try { - /* We extract elements from the queue as long as their target is equal - * to last. If during the process we exhaust a batch, we close it. */ - - while(! 
queue.isEmpty() && refArray[i = queue.first()] == last) { - successor = IntArrays.grow(successor, d + 1); - successor[d] = (prevTarget[i] += batchIbs[i].readDelta() + 1); - label = ObjectArrays.grow(label, d + 1); - label[d] = prototype.copy(); - label[d].fromBitStream(labelInputBitStream[i], last); - - if (--inputStreamLength[i] == 0) { - queue.dequeue(); - batchIbs[i].close(); - labelInputBitStream[i].close(); - batchIbs[i] = null; - labelInputBitStream[i] = null; - } - else { - // We read a new source and update the queue. - final int sourceDelta = batchIbs[i].readDelta(); - if (sourceDelta != 0) { - refArray[i] += sourceDelta; - prevTarget[i] = -1; - queue.changed(); - } - } - d++; - } - // Neither quicksort nor heaps are stable, so we reestablish order here. - it.unimi.dsi.fastutil.Arrays.quickSort(0, d, (x, y) -> Integer.compare(successor[x], successor[y]), - (x, y) -> { - final int t = successor[x]; - successor[x] = successor[y]; - successor[y] = t; - final Label l = label[x]; - label[x] = label[y]; - label[y] = l; - }); - } - catch(final IOException e) { - throw new RuntimeException(e); - } + @Override + public int nextInt() { + last++; + int d = 0; + int i; - outdegree = d; - return last; - } + try { + /* We extract elements from the queue as long as their target is equal + * to last. If during the process we exhaust a batch, we close it. */ - @Override - public int[] successorArray() { - if (last == -1) throw new IllegalStateException(); - return successor; - } + while(! 
queue.isEmpty() && refArray[i = queue.first()] == last) { + successor = IntArrays.grow(successor, d + 1); + successor[d] = (prevTarget[i] += batchIbs[i].readDelta() + 1); + label = ObjectArrays.grow(label, d + 1); + label[d] = prototype.copy(); + label[d].fromBitStream(labelInputBitStream[i], last); - @SuppressWarnings("deprecation") - @Override - protected void finalize() throws Throwable { - try { - for(final InputBitStream ibs: batchIbs) if (ibs != null) ibs.close(); - for(final InputBitStream ibs: labelInputBitStream) if (ibs != null) ibs.close(); - } - finally { - super.finalize(); + if (--inputStreamLength[i] == 0) { + queue.dequeue(); + batchIbs[i].close(); + labelInputBitStream[i].close(); + batchIbs[i] = null; + labelInputBitStream[i] = null; + } + else { + // We read a new source and update the queue. + final int sourceDelta = batchIbs[i].readDelta(); + if (sourceDelta != 0) { + refArray[i] += sourceDelta; + prevTarget[i] = -1; + queue.changed(); + } + } + d++; } + // Neither quicksort nor heaps are stable, so we reestablish order here. 
+ it.unimi.dsi.fastutil.Arrays.quickSort(0, d, (x, y) -> Integer.compare(successor[x], successor[y]), + (x, y) -> { + final int t = successor[x]; + successor[x] = successor[y]; + successor[y] = t; + final Label l = label[x]; + label[x] = label[y]; + label[y] = l; + }); + } + catch(final IOException e) { + throw new RuntimeException(e); } - @Override - public LabelledArcIterator successors() { - if (last == -1) throw new IllegalStateException(); - return new LabelledArcIterator() { - int last = -1; - - @Override - public Label label() { - return label[last]; - } + outdegree = d; + return last; + } - @Override - public int nextInt() { - if (last + 1 == outdegree) return -1; - return successor[++last]; - } + @Override + public int[] successorArray() { + if (last == -1) throw new IllegalStateException(); + return successor; + } - @Override - public int skip(final int k) { - final int toSkip = Math.min(k, outdegree - last - 1); - last += toSkip; - return toSkip; - } - }; + @SuppressWarnings("deprecation") + @Override + protected void finalize() throws Throwable { + try { + for(final InputBitStream ibs: batchIbs) if (ibs != null) ibs.close(); + for(final InputBitStream ibs: labelInputBitStream) if (ibs != null) ibs.close(); } + finally { + super.finalize(); + } + } + @Override + public LabelledArcIterator successors() { + if (last == -1) throw new IllegalStateException(); + return new LabelledArcIterator() { + int last = -1; - @Override - public ArcLabelledNodeIterator copy(final int upperBound) { - try { - if (last == -1) return new InternalArcLabelledNodeIterator(upperBound); - else return new InternalArcLabelledNodeIterator(upperBound, batchIbs, labelInputBitStream, - refArray.clone(), prevTarget.clone(), inputStreamLength.clone(), last, outdegree, Arrays.copyOf(successor, outdegree), Arrays.copyOf(label, outdegree)); + @Override + public Label label() { + return label[last]; + } + + @Override + public int nextInt() { + if (last + 1 == outdegree) return -1; + return 
successor[++last]; } - catch (final IOException e) { - throw new RuntimeException(e); + + @Override + public int skip(final int k) { + final int toSkip = Math.min(k, outdegree - last - 1); + last += toSkip; + return toSkip; } - } + }; } @Override - public ArcLabelledNodeIterator nodeIterator() { + public ArcLabelledNodeIterator copy(final int upperBound) { try { - return new InternalArcLabelledNodeIterator(Integer.MAX_VALUE); + if (last == -1) return new InternalArcLabelledNodeIterator(upperBound); + else return new InternalArcLabelledNodeIterator(upperBound, batchIbs, labelInputBitStream, + refArray.clone(), prevTarget.clone(), inputStreamLength.clone(), last, outdegree, Arrays.copyOf(successor, outdegree), Arrays.copyOf(label, outdegree)); } catch (final IOException e) { throw new RuntimeException(e); } } + } - @SuppressWarnings("deprecation") - @Override - protected void finalize() throws Throwable { - try { - for(final File f : batches) f.delete(); - for(final File f : labelBatches) f.delete(); - } - finally { - super.finalize(); - } + + @Override + public ArcLabelledNodeIterator nodeIterator() { + try { + return new InternalArcLabelledNodeIterator(Integer.MAX_VALUE); } - @Override - public Label prototype() { - return prototype; + catch (final IOException e) { + throw new RuntimeException(e); } + } - }; + @SuppressWarnings("deprecation") + @Override + protected void finalize() throws Throwable { + try { + for(final File f : batches) f.delete(); + for(final File f : labelBatches) f.delete(); + } + finally { + super.finalize(); + } + } + + @Override + public Label prototype() { + return prototype; + } } From b06c52581c34a06a0c71f1b9290113f33294993b Mon Sep 17 00:00:00 2001 From: Luigi Foscari Date: Sat, 18 Feb 2023 16:00:53 +0100 Subject: [PATCH 02/85] moved ArcLabelledImmutableGraph, changed processTransposeBatch to return the number of unique pairs --- src/it/unimi/dsi/webgraph/Transform.java | 543 ++++++++++++----------- 1 file changed, 275 insertions(+), 268 
deletions(-) diff --git a/src/it/unimi/dsi/webgraph/Transform.java b/src/it/unimi/dsi/webgraph/Transform.java index e172925..af8d145 100644 --- a/src/it/unimi/dsi/webgraph/Transform.java +++ b/src/it/unimi/dsi/webgraph/Transform.java @@ -1271,6 +1271,252 @@ protected void finalize() throws Throwable { } + public static class ArcLabelledBatchGraph extends ArcLabelledImmutableSequentialGraph { + private final int n; + private final long numArcs; + private final ObjectArrayList batches; + private final ObjectArrayList labelBatches; + private final Label prototype; + + public ArcLabelledBatchGraph(int n, long numArcs, ObjectArrayList batches, ObjectArrayList labelBatches, Label prototype) { + this.n = n; + this.numArcs = numArcs; + this.batches = batches; + this.labelBatches = labelBatches; + this.prototype = prototype; + } + + @Override + public int numNodes() { return n; } + @Override + public long numArcs() { return numArcs; } + @Override + public boolean hasCopiableIterators() { return true; } + + class InternalArcLabelledNodeIterator extends ArcLabelledNodeIterator { + /** The buffer size. We can't make it too big—there's two per batch, per thread. */ + private static final int STD_BUFFER_SIZE = 64 * 1024; + private final int[] refArray; + private final InputBitStream[] batchIbs; + private final InputBitStream[] labelInputBitStream; + private final int[] inputStreamLength; + private final int[] prevTarget; + + // The indirect queue used to merge the batches. + private final IntHeapSemiIndirectPriorityQueue queue; + /** The limit for {@link #hasNext()}. */ + private final int hasNextLimit; + + /** The last returned node (-1 if no node has been returned yet). */ + private int last; + /** The outdegree of the current node (valid if {@link #last} is not -1). */ + private int outdegree; + /** The successors of the current node (valid if {@link #last} is not -1); + * only the first {@link #outdegree} entries are meaningful. 
*/ + private int[] successor; + /** The labels of the arcs going out of the current node (valid if {@link #last} is not -1); + * only the first {@link #outdegree} entries are meaningful. */ + private Label[] label; + + public InternalArcLabelledNodeIterator(final int upperBound) throws IOException { + this(upperBound, null, null, null, null, null, -1, 0, IntArrays.EMPTY_ARRAY, Label.EMPTY_LABEL_ARRAY); + } + + public InternalArcLabelledNodeIterator(final int upperBound, final InputBitStream[] baseIbs, final InputBitStream[] baseLabelInputBitStream, final int[] refArray, final int[] prevTarget, final int[] inputStreamLength, final int last, final int outdegree, final int successor[], final Label[] label) throws IOException { + this.hasNextLimit = Math.min(n, upperBound) - 1; + this.last = last; + this.outdegree = outdegree; + this.successor = successor; + this.label = label; + batchIbs = new InputBitStream[batches.size()]; + labelInputBitStream = new InputBitStream[batches.size()]; + + if (refArray == null) { + this.refArray = new int[batches.size()]; + this.prevTarget = new int[batches.size()]; + this.inputStreamLength = new int[batches.size()]; + Arrays.fill(this.prevTarget, -1); + queue = new IntHeapSemiIndirectPriorityQueue(this.refArray); + // We open all files and load the first element into the reference array. 
+ for(int i = 0; i < batches.size(); i++) { + batchIbs[i] = new InputBitStream(batches.get(i), STD_BUFFER_SIZE); + labelInputBitStream[i] = new InputBitStream(labelBatches.get(i), STD_BUFFER_SIZE); + this.inputStreamLength[i] = batchIbs[i].readDelta(); + this.refArray[i] = batchIbs[i].readDelta(); + queue.enqueue(i); + } + } + else { + this.refArray = refArray; + this.prevTarget = prevTarget; + this.inputStreamLength = inputStreamLength; + queue = new IntHeapSemiIndirectPriorityQueue(refArray); + + for(int i = 0; i < refArray.length; i++) { + if (baseIbs[i] != null) { + batchIbs[i] = new InputBitStream(batches.get(i), STD_BUFFER_SIZE); + batchIbs[i].position(baseIbs[i].position()); + labelInputBitStream[i] = new InputBitStream(labelBatches.get(i), STD_BUFFER_SIZE); + labelInputBitStream[i].position(baseLabelInputBitStream[i].position()); + queue.enqueue(i); + } + } + } + } + + @Override + public int outdegree() { + if (last == -1) throw new IllegalStateException(); + return outdegree; + } + + @Override + public boolean hasNext() { + return last < hasNextLimit; + } + + @Override + public int nextInt() { + last++; + int d = 0; + int i; + + try { + /* We extract elements from the queue as long as their target is equal + * to last. If during the process we exhaust a batch, we close it. */ + + while(! queue.isEmpty() && refArray[i = queue.first()] == last) { + successor = IntArrays.grow(successor, d + 1); + successor[d] = (prevTarget[i] += batchIbs[i].readDelta() + 1); + label = ObjectArrays.grow(label, d + 1); + label[d] = prototype.copy(); + label[d].fromBitStream(labelInputBitStream[i], last); + + if (--inputStreamLength[i] == 0) { + queue.dequeue(); + batchIbs[i].close(); + labelInputBitStream[i].close(); + batchIbs[i] = null; + labelInputBitStream[i] = null; + } + else { + // We read a new source and update the queue. 
+ final int sourceDelta = batchIbs[i].readDelta(); + if (sourceDelta != 0) { + refArray[i] += sourceDelta; + prevTarget[i] = -1; + queue.changed(); + } + } + d++; + } + // Neither quicksort nor heaps are stable, so we reestablish order here. + it.unimi.dsi.fastutil.Arrays.quickSort(0, d, (x, y) -> Integer.compare(successor[x], successor[y]), + (x, y) -> { + final int t = successor[x]; + successor[x] = successor[y]; + successor[y] = t; + final Label l = label[x]; + label[x] = label[y]; + label[y] = l; + }); + } + catch(final IOException e) { + throw new RuntimeException(e); + } + + outdegree = d; + return last; + } + + @Override + public int[] successorArray() { + if (last == -1) throw new IllegalStateException(); + return successor; + } + + @SuppressWarnings("deprecation") + @Override + protected void finalize() throws Throwable { + try { + for(final InputBitStream ibs: batchIbs) if (ibs != null) ibs.close(); + for(final InputBitStream ibs: labelInputBitStream) if (ibs != null) ibs.close(); + } + finally { + super.finalize(); + } + } + + @Override + public LabelledArcIterator successors() { + if (last == -1) throw new IllegalStateException(); + return new LabelledArcIterator() { + int last = -1; + + @Override + public Label label() { + return label[last]; + } + + @Override + public int nextInt() { + if (last + 1 == outdegree) return -1; + return successor[++last]; + } + + @Override + public int skip(final int k) { + final int toSkip = Math.min(k, outdegree - last - 1); + last += toSkip; + return toSkip; + } + }; + } + + + @Override + public ArcLabelledNodeIterator copy(final int upperBound) { + try { + if (last == -1) return new InternalArcLabelledNodeIterator(upperBound); + else return new InternalArcLabelledNodeIterator(upperBound, batchIbs, labelInputBitStream, + refArray.clone(), prevTarget.clone(), inputStreamLength.clone(), last, outdegree, Arrays.copyOf(successor, outdegree), Arrays.copyOf(label, outdegree)); + } + catch (final IOException e) { + throw new 
RuntimeException(e); + } + } + } + + + @Override + public ArcLabelledNodeIterator nodeIterator() { + try { + return new InternalArcLabelledNodeIterator(Integer.MAX_VALUE); + } + catch (final IOException e) { + throw new RuntimeException(e); + } + } + + @SuppressWarnings("deprecation") + @Override + protected void finalize() throws Throwable { + try { + for(final File f : batches) f.delete(); + for(final File f : labelBatches) f.delete(); + } + finally { + super.finalize(); + } + } + + @Override + public Label prototype() { + return prototype; + } + } + + /** Sorts the given source and target arrays w.r.t. the target and stores them in a temporary file. * * @param n the index of the last element to be sorted (exclusive). @@ -1331,35 +1577,40 @@ else if (target[i] != target[i - 1]) { * @param tempDir a temporary directory where to store the sorted arrays. * @param batches a list of files to which the batch file will be added. * @param labelBatches a list of files to which the label batch file will be added. + * @return the number of pairs in the batch (might be less than n because duplicates are eliminated). 
*/ - private static void processTransposeBatch(final int n, final int[] source, final int[] target, final long[] start, - final InputBitStream labelBitStream, final File tempDir, final List batches, final List labelBatches, - final Label prototype) throws IOException { + public static int processTransposeBatch(final int n, final int[] source, final int[] target, final long[] start, + final InputBitStream labelBitStream, final File tempDir, final List batches, final List labelBatches, + final Label prototype) throws IOException { it.unimi.dsi.fastutil.Arrays.parallelQuickSort(0, n, (x,y) -> { - final int t = Integer.compare(source[x], source[y]); - if (t != 0) return t; - return Integer.compare(target[x], target[y]); - }, - (x, y) -> { - int t = source[x]; - source[x] = source[y]; - source[y] = t; - t = target[x]; - target[x] = target[y]; - target[y] = t; - final long u = start[x]; - start[x] = start[y]; - start[y] = u; - }); + final int t = Integer.compare(source[x], source[y]); + if (t != 0) return t; + return Integer.compare(target[x], target[y]); + }, + (x, y) -> { + int t = source[x]; + source[x] = source[y]; + source[y] = t; + t = target[x]; + target[x] = target[y]; + target[y] = t; + final long u = start[x]; + start[x] = start[y]; + start[y] = u; + }); final File batchFile = File.createTempFile("batch", ".bitstream", tempDir); batchFile.deleteOnExit(); batches.add(batchFile); final OutputBitStream batch = new OutputBitStream(batchFile); + int u = 0; if (n != 0) { // Compute unique pairs + u = 1; + for(int i = n - 1; i-- != 0;) if (source[i] != source[i + 1] || target[i] != target[i + 1]) u++; + batch.writeDelta(n); int prevSource = source[0]; batch.writeDelta(prevSource); @@ -1392,6 +1643,8 @@ else if (target[i] != target[i - 1]) { prototype.toBitStream(labelObs, target[i]); } labelObs.close(); + + return u; } /** Returns an immutable graph obtained by reversing all arcs in g, using an offline method. 
@@ -1728,252 +1981,6 @@ public static ArcLabelledImmutableGraph transposeOffline(final ArcLabelledImmuta return new ArcLabelledBatchGraph(n, numArcs, batches, labelBatches, prototype); } - public static class ArcLabelledBatchGraph extends ArcLabelledImmutableSequentialGraph { - private int n; - private long numArcs; - private ObjectArrayList batches; - private ObjectArrayList labelBatches; - private Label prototype; - - public ArcLabelledBatchGraph(int n, long numArcs, ObjectArrayList batches, ObjectArrayList labelBatches, Label prototype) { - this.n = n; - this.numArcs = numArcs; - this.batches = batches; - this.labelBatches = labelBatches; - this.prototype = prototype; - } - - @Override - public int numNodes() { return n; } - @Override - public long numArcs() { return numArcs; } - @Override - public boolean hasCopiableIterators() { return true; } - - class InternalArcLabelledNodeIterator extends ArcLabelledNodeIterator { - /** The buffer size. We can't make it too big—there's two per batch, per thread. */ - private static final int STD_BUFFER_SIZE = 64 * 1024; - private final int[] refArray; - private final InputBitStream[] batchIbs; - private final InputBitStream[] labelInputBitStream; - private final int[] inputStreamLength; - private final int[] prevTarget; - - // The indirect queue used to merge the batches. - private final IntHeapSemiIndirectPriorityQueue queue; - /** The limit for {@link #hasNext()}. */ - private final int hasNextLimit; - - /** The last returned node (-1 if no node has been returned yet). */ - private int last; - /** The outdegree of the current node (valid if {@link #last} is not -1). */ - private int outdegree; - /** The successors of the current node (valid if {@link #last} is not -1); - * only the first {@link #outdegree} entries are meaningful. */ - private int[] successor; - /** The labels of the arcs going out of the current node (valid if {@link #last} is not -1); - * only the first {@link #outdegree} entries are meaningful. 
*/ - private Label[] label; - - public InternalArcLabelledNodeIterator(final int upperBound) throws IOException { - this(upperBound, null, null, null, null, null, -1, 0, IntArrays.EMPTY_ARRAY, Label.EMPTY_LABEL_ARRAY); - } - - public InternalArcLabelledNodeIterator(final int upperBound, final InputBitStream[] baseIbs, final InputBitStream[] baseLabelInputBitStream, final int[] refArray, final int[] prevTarget, final int[] inputStreamLength, final int last, final int outdegree, final int successor[], final Label[] label) throws IOException { - this.hasNextLimit = Math.min(n, upperBound) - 1; - this.last = last; - this.outdegree = outdegree; - this.successor = successor; - this.label = label; - batchIbs = new InputBitStream[batches.size()]; - labelInputBitStream = new InputBitStream[batches.size()]; - - if (refArray == null) { - this.refArray = new int[batches.size()]; - this.prevTarget = new int[batches.size()]; - this.inputStreamLength = new int[batches.size()]; - Arrays.fill(this.prevTarget, -1); - queue = new IntHeapSemiIndirectPriorityQueue(this.refArray); - // We open all files and load the first element into the reference array. 
- for(int i = 0; i < batches.size(); i++) { - batchIbs[i] = new InputBitStream(batches.get(i), STD_BUFFER_SIZE); - labelInputBitStream[i] = new InputBitStream(labelBatches.get(i), STD_BUFFER_SIZE); - this.inputStreamLength[i] = batchIbs[i].readDelta(); - this.refArray[i] = batchIbs[i].readDelta(); - queue.enqueue(i); - } - } - else { - this.refArray = refArray; - this.prevTarget = prevTarget; - this.inputStreamLength = inputStreamLength; - queue = new IntHeapSemiIndirectPriorityQueue(refArray); - - for(int i = 0; i < refArray.length; i++) { - if (baseIbs[i] != null) { - batchIbs[i] = new InputBitStream(batches.get(i), STD_BUFFER_SIZE); - batchIbs[i].position(baseIbs[i].position()); - labelInputBitStream[i] = new InputBitStream(labelBatches.get(i), STD_BUFFER_SIZE); - labelInputBitStream[i].position(baseLabelInputBitStream[i].position()); - queue.enqueue(i); - } - } - } - } - - @Override - public int outdegree() { - if (last == -1) throw new IllegalStateException(); - return outdegree; - } - - @Override - public boolean hasNext() { - return last < hasNextLimit; - } - - @Override - public int nextInt() { - last++; - int d = 0; - int i; - - try { - /* We extract elements from the queue as long as their target is equal - * to last. If during the process we exhaust a batch, we close it. */ - - while(! queue.isEmpty() && refArray[i = queue.first()] == last) { - successor = IntArrays.grow(successor, d + 1); - successor[d] = (prevTarget[i] += batchIbs[i].readDelta() + 1); - label = ObjectArrays.grow(label, d + 1); - label[d] = prototype.copy(); - label[d].fromBitStream(labelInputBitStream[i], last); - - if (--inputStreamLength[i] == 0) { - queue.dequeue(); - batchIbs[i].close(); - labelInputBitStream[i].close(); - batchIbs[i] = null; - labelInputBitStream[i] = null; - } - else { - // We read a new source and update the queue. 
- final int sourceDelta = batchIbs[i].readDelta(); - if (sourceDelta != 0) { - refArray[i] += sourceDelta; - prevTarget[i] = -1; - queue.changed(); - } - } - d++; - } - // Neither quicksort nor heaps are stable, so we reestablish order here. - it.unimi.dsi.fastutil.Arrays.quickSort(0, d, (x, y) -> Integer.compare(successor[x], successor[y]), - (x, y) -> { - final int t = successor[x]; - successor[x] = successor[y]; - successor[y] = t; - final Label l = label[x]; - label[x] = label[y]; - label[y] = l; - }); - } - catch(final IOException e) { - throw new RuntimeException(e); - } - - outdegree = d; - return last; - } - - @Override - public int[] successorArray() { - if (last == -1) throw new IllegalStateException(); - return successor; - } - - @SuppressWarnings("deprecation") - @Override - protected void finalize() throws Throwable { - try { - for(final InputBitStream ibs: batchIbs) if (ibs != null) ibs.close(); - for(final InputBitStream ibs: labelInputBitStream) if (ibs != null) ibs.close(); - } - finally { - super.finalize(); - } - } - - @Override - public LabelledArcIterator successors() { - if (last == -1) throw new IllegalStateException(); - return new LabelledArcIterator() { - int last = -1; - - @Override - public Label label() { - return label[last]; - } - - @Override - public int nextInt() { - if (last + 1 == outdegree) return -1; - return successor[++last]; - } - - @Override - public int skip(final int k) { - final int toSkip = Math.min(k, outdegree - last - 1); - last += toSkip; - return toSkip; - } - }; - } - - - @Override - public ArcLabelledNodeIterator copy(final int upperBound) { - try { - if (last == -1) return new InternalArcLabelledNodeIterator(upperBound); - else return new InternalArcLabelledNodeIterator(upperBound, batchIbs, labelInputBitStream, - refArray.clone(), prevTarget.clone(), inputStreamLength.clone(), last, outdegree, Arrays.copyOf(successor, outdegree), Arrays.copyOf(label, outdegree)); - } - catch (final IOException e) { - throw new 
RuntimeException(e); - } - } - } - - - @Override - public ArcLabelledNodeIterator nodeIterator() { - try { - return new InternalArcLabelledNodeIterator(Integer.MAX_VALUE); - } - catch (final IOException e) { - throw new RuntimeException(e); - } - } - - @SuppressWarnings("deprecation") - @Override - protected void finalize() throws Throwable { - try { - for(final File f : batches) f.delete(); - for(final File f : labelBatches) f.delete(); - } - finally { - super.finalize(); - } - } - - @Override - public Label prototype() { - return prototype; - } - } - - /** Returns an immutable graph obtained by reversing all arcs in g. * *

This method can process {@linkplain ImmutableGraph#loadOffline(CharSequence) offline graphs}. @@ -2618,8 +2625,8 @@ public static void main(final String args[]) throws IOException, IllegalArgument "transposeOffline sourceBasename destBasename [batchSize] [tempDir]\n" + "symmetrize sourceBasename [transposeBasename] destBasename\n" + "symmetrizeOffline sourceBasename destBasename [batchSize] [tempDir]\n" + - "simplifyOffline sourceBasename destBasename [batchSize] [tempDir]\n" + - "simplify sourceBasename transposeBasename destBasename\n" + + "simplifyOffline sourceBasename destBasename [batchSize] [tempDir]\n" + + "simplify sourceBasename transposeBasename destBasename\n" + "union source1Basename source2Basename destBasename [strategy]\n" + "compose source1Basename source2Basename destBasename [semiring]\n" + "gray sourceBasename destBasename\n" + @@ -2646,8 +2653,8 @@ public static void main(final String args[]) throws IOException, IllegalArgument new Switch("ascii", 'a', "ascii", "Maps are in ASCII form (one integer per line)."), new UnflaggedOption("transform", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The transformation to be applied."), new UnflaggedOption("param", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.GREEDY, "The remaining parameters."), - } - ); + } + ); final JSAPResult jsapResult = jsap.parse(args); if (jsap.messagePrinted()) System.exit(1); From a1f6db1bc67f158ef9784de76e24758e8f09ed1e Mon Sep 17 00:00:00 2001 From: Luigi Foscari Date: Sat, 18 Feb 2023 16:03:40 +0100 Subject: [PATCH 03/85] created ScatteredLabelledArcsASCIIGraph by copying ScatteredArcsASCIIGraph and adapting the constructors (wip) --- .../ScatteredLabelledArcsASCIIGraph.java | 918 ++++++++++++++++++ 1 file changed, 918 insertions(+) create mode 100644 src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java diff --git a/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java 
b/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java new file mode 100644 index 0000000..a69eaf7 --- /dev/null +++ b/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java @@ -0,0 +1,918 @@ +/* + * Copyright (C) 2011-2023 Sebastiano Vigna + * + * This program and the accompanying materials are made available under the + * terms of the GNU Lesser General Public License v2.1 or later, + * which is available at + * http://www.gnu.org/licenses/old-licenses/lgpl-2.1-standalone.html, + * or the Apache Software License 2.0, which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. + * + * SPDX-License-Identifier: LGPL-2.1-or-later OR Apache-2.0 + */ + +package it.unimi.dsi.webgraph.labelling; + +import it.unimi.dsi.Util; +import it.unimi.dsi.fastutil.BigArrays; +import it.unimi.dsi.fastutil.Hash; +import it.unimi.dsi.fastutil.booleans.BooleanBigArrays; +import it.unimi.dsi.fastutil.bytes.ByteArrays; +import it.unimi.dsi.fastutil.ints.IntBigArrays; +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.fastutil.io.FastBufferedInputStream; +import it.unimi.dsi.fastutil.io.FastByteArrayOutputStream; +import it.unimi.dsi.fastutil.longs.LongBigArrays; +import it.unimi.dsi.fastutil.objects.Object2LongFunction; +import it.unimi.dsi.fastutil.objects.ObjectArrayList; +import it.unimi.dsi.io.InputBitStream; +import it.unimi.dsi.io.OutputBitStream; +import it.unimi.dsi.logging.ProgressLogger; +import it.unimi.dsi.webgraph.ImmutableSequentialGraph; +import it.unimi.dsi.webgraph.Transform; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.Iterator; + 
+import static it.unimi.dsi.fastutil.HashCommon.bigArraySize; +import static it.unimi.dsi.fastutil.HashCommon.maxFill; +import static it.unimi.dsi.webgraph.Transform.processTransposeBatch; + + +/** + * Da riscrivere + */ + +public class ScatteredLabelledArcsASCIIGraph extends ImmutableSequentialGraph { + private static final Logger LOGGER = LoggerFactory.getLogger(ScatteredLabelledArcsASCIIGraph.class); + + // TODO: rollback to false + private final static boolean DEBUG = true; + + /** + * The default batch size. + */ + public static final int DEFAULT_BATCH_SIZE = 1000000; + /** + * The extension of the identifier file (a binary list of longs). + */ + private static final String IDS_EXTENSION = ".ids"; + /** + * The batch graph used to return node iterators. + */ + private final Transform.ArcLabelledBatchGraph arcLabelledBatchGraph; + /** + * The list of identifiers in order of appearance. + */ + public long[] ids; + + private static final class Long2IntOpenHashBigMap implements java.io.Serializable, Cloneable, Hash { + public static final long serialVersionUID = 0L; + + /** + * The big array of keys. + */ + public transient long[][] key; + + /** + * The big array of values. + */ + public transient int[][] value; + + /** + * The big array telling whether a position is used. + */ + private transient boolean[][] used; + + /** + * The acceptable load factor. + */ + private final float f; + + /** + * The current table size (always a power of 2). + */ + private transient long n; + + /** + * Threshold after which we rehash. It must be the table size times {@link #f}. + */ + private transient long maxFill; + + /** + * The mask for wrapping a position counter. + */ + private transient long mask; + + /** + * The mask for wrapping a segment counter. + */ + private transient int segmentMask; + + /** + * The mask for wrapping a base counter. + */ + private transient int baseMask; + + /** + * Number of entries in the set. 
+ */ + private long size; + + /** + * Initialises the mask values. + */ + private void initMasks() { + this.mask = this.n - 1; + /* + * Note that either we have more than one segment, and in this case all segments are + * BigArrays.SEGMENT_SIZE long, or we have exactly one segment whose length is a power of + * two. + */ + this.segmentMask = this.key[0].length - 1; + this.baseMask = this.key.length - 1; + } + + /** + * Creates a new hash big set. + * + *

The actual table size will be the least power of two greater than + * expected/f. + * + * @param expected the expected number of elements in the set. + * @param f the load factor. + */ + public Long2IntOpenHashBigMap(final long expected, final float f) { + if (f <= 0 || f > 1) + throw new IllegalArgumentException("Load factor must be greater than 0 and smaller than or equal to 1"); + if (this.n < 0) throw new IllegalArgumentException("The expected number of elements must be nonnegative"); + this.f = f; + this.n = bigArraySize(expected, f); + this.maxFill = maxFill(this.n, f); + this.key = LongBigArrays.newBigArray(this.n); + this.value = IntBigArrays.newBigArray(this.n); + this.used = BooleanBigArrays.newBigArray(this.n); + this.initMasks(); + } + + /** + * Creates a new hash big set with initial expected {@link Hash#DEFAULT_INITIAL_SIZE} elements + * and {@link Hash#DEFAULT_LOAD_FACTOR} as load factor. + */ + + public Long2IntOpenHashBigMap() { + this(DEFAULT_INITIAL_SIZE, DEFAULT_LOAD_FACTOR); + } + + public int put(final long k, final int v) { + final long h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); + + // The starting point. + int displ = (int) (h & this.segmentMask); + int base = (int) ((h & this.mask) >>> BigArrays.SEGMENT_SHIFT); + + // There's always an unused entry. + while (this.used[base][displ]) { + if (k == this.key[base][displ]) { + final int oldValue = this.value[base][displ]; + this.value[base][displ] = v; + return oldValue; + } + base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; + } + + this.used[base][displ] = true; + this.key[base][displ] = k; + this.value[base][displ] = v; + + if (++this.size >= this.maxFill) this.rehash(2 * this.n); + return -1; + } + + public int get(final long k) { + final long h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); + + // The starting point. 
+ int displ = (int) (h & this.segmentMask); + int base = (int) ((h & this.mask) >>> BigArrays.SEGMENT_SHIFT); + + // There's always an unused entry. + while (this.used[base][displ]) { + if (k == this.key[base][displ]) return this.value[base][displ]; + base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; + } + + return -1; + } + + private void rehash(final long newN) { + final boolean[][] used = this.used; + final long[][] key = this.key; + final int[][] value = this.value; + final boolean[][] newUsed = BooleanBigArrays.newBigArray(newN); + final long[][] newKey = LongBigArrays.newBigArray(newN); + final int[][] newValue = IntBigArrays.newBigArray(newN); + final long newMask = newN - 1; + final int newSegmentMask = newKey[0].length - 1; + final int newBaseMask = newKey.length - 1; + + int base = 0, displ = 0; + long h; + long k; + + for (long i = this.size; i-- != 0; ) { + + while (!used[base][displ]) + base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)); + + k = key[base][displ]; + h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); + + // The starting point. + int d = (int) (h & newSegmentMask); + int b = (int) ((h & newMask) >>> BigArrays.SEGMENT_SHIFT); + + while (newUsed[b][d]) + b = (b + ((d = (d + 1) & newSegmentMask) == 0 ? 1 : 0)) & newBaseMask; + + newUsed[b][d] = true; + newKey[b][d] = k; + newValue[b][d] = value[base][displ]; + + base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)); + } + + this.n = newN; + this.key = newKey; + this.value = newValue; + this.used = newUsed; + this.initMasks(); + this.maxFill = maxFill(this.n, this.f); + } + + public void compact() { + int base = 0, displ = 0, b = 0, d = 0; + for (long i = this.size; i-- != 0; ) { + while (!this.used[base][displ]) + base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 
1 : 0)) & this.baseMask; + this.key[b][d] = this.key[base][displ]; + this.value[b][d] = this.value[base][displ]; + base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; + b = (b + ((d = (d + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; + } + } + + public long size() { + return this.size; + } + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a standard scattered list of arcs. + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPrototype, final LabelMapping labelMapping) throws IOException { + this(is, labelPrototype, labelMapping, false); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a standard scattered list of arcs. + * @param symmetrize the new graph will be forced to be symmetric. + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPrototype, final LabelMapping labelMapping, final boolean symmetrize) throws IOException { + this(is, labelPrototype, labelMapping, symmetrize, false); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a standard scattered list of arcs. + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPrototype, final LabelMapping labelMapping, final boolean symmetrize, final boolean noLoops) throws IOException { + this(is, labelPrototype, labelMapping, symmetrize, noLoops, DEFAULT_BATCH_SIZE); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a standard scattered list of arcs. + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. 
+ * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by this method. + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPrototype, final LabelMapping labelMapping, final boolean symmetrize, final boolean noLoops, final int batchSize) throws IOException { + this(is, labelPrototype, labelMapping, symmetrize, noLoops, batchSize, null); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a standard scattered list of arcs. + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by this method. + * @param tempDir a temporary directory for the batches, or null for {@link File#createTempFile(java.lang.String, java.lang.String)}'s choice. + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPrototype, final LabelMapping labelMapping, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir) throws IOException { + this(is, labelPrototype, labelMapping, symmetrize, noLoops, batchSize, tempDir, null); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a standard scattered list of arcs. + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by this method. + * @param tempDir a temporary directory for the batches, or null for {@link File#createTempFile(java.lang.String, java.lang.String)}'s choice. + * @param pl a progress logger, or null. 
+ */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPrototype, final LabelMapping labelMapping, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { + this(is, null, labelPrototype, labelMapping, null, -1, symmetrize, noLoops, batchSize, tempDir, pl); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a scattered list of arcs. + * @param function an explicitly provided function from string representing nodes to node numbers, or null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final Charset charset, final int n) throws IOException { + this(is, function, labelPrototype, labelMapping, charset, n, false); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a scattered list of arcs. + * @param function an explicitly provided function from string representing nodes to node numbers, or null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param symmetrize the new graph will be forced to be symmetric. 
+ */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final Charset charset, final int n, final boolean symmetrize) throws IOException { + this(is, function, labelPrototype, labelMapping, charset, n, symmetrize, false); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a scattered list of arcs. + * @param function an explicitly provided function from string representing nodes to node numbers, or null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final Charset charset, final int n, final boolean symmetrize, final boolean noLoops) throws IOException { + this(is, function, labelPrototype, labelMapping, charset, n, symmetrize, noLoops, DEFAULT_BATCH_SIZE); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a scattered list of arcs. + * @param function an explicitly provided function from string representing nodes to node numbers, or null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. 
+ * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by this method. + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final Charset charset, final int n, final boolean symmetrize, final boolean noLoops, final int batchSize) throws IOException { + this(is, function, labelPrototype, labelMapping, charset, n, symmetrize, noLoops, batchSize, null); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a scattered list of arcs. + * @param function an explicitly provided function from string representing nodes to node numbers, or null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by this method. + * @param tempDir a temporary directory for the batches, or null for {@link File#createTempFile(java.lang.String, java.lang.String)}'s choice. 
+ */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final Charset charset, final int n, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir) throws IOException { + this(is, function, labelPrototype, labelMapping, charset, n, symmetrize, noLoops, batchSize, tempDir, null); + } + + + // TODO: Move somewhere else + // Given a label prototype and a value set the value inside the label without creating a new one + // it's like a setter, but for labels. + public interface LabelMapping { + void apply(Label prototype, String representation); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a scattered list of arcs. + * @param function an explicitly provided function from string representing nodes to node numbers, or null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by this method. + * @param tempDir a temporary directory for the batches, or null for {@link File#createTempFile(java.lang.String, java.lang.String)}'s choice. + * @param pl a progress logger, or null. 
+ */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, Charset charset, final int n, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { + @SuppressWarnings("resource") final FastBufferedInputStream fbis = new FastBufferedInputStream(is); + ScatteredLabelledArcsASCIIGraph.Long2IntOpenHashBigMap map = new ScatteredLabelledArcsASCIIGraph.Long2IntOpenHashBigMap(); + + int numNodes = -1; + if (charset == null) charset = StandardCharsets.ISO_8859_1; + + int j; + int[] source = new int[batchSize], target = new int[batchSize]; + final long[] labelStart = new long[batchSize]; + FastByteArrayOutputStream fbos = new FastByteArrayOutputStream(); + OutputBitStream obs = new OutputBitStream(fbos); + final ObjectArrayList batches = new ObjectArrayList<>(), + labelBatches = new ObjectArrayList<>(); + final Label prototype = labelPrototype.copy(); + + if (pl != null) { + pl.itemsName = "labelled arcs"; + pl.start("Creating sorted batches..."); + } + + j = 0; + long pairs = 0; // Number of pairs + byte[] array = new byte[1024]; + for (long line = 1; ; line++) { + int start = 0, len; + while ((len = fbis.readLine(array, start, array.length - start, FastBufferedInputStream.ALL_TERMINATORS)) == array.length - start) { + start += len; + array = ByteArrays.grow(array, array.length + 1); + } + + if (len == -1) break; // EOF + + final int lineLength = start + len; + + if (DEBUG) + System.err.println("Reading line " + line + "... (" + new String(array, 0, lineLength, charset) + ")"); + + // Skip whitespace at the start of the line. 
+ int offset = 0; + while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ') offset++; + + if (offset == lineLength) { + if (DEBUG) System.err.println("Skipping line " + line + "..."); + continue; // Whitespace line + } + + if (array[0] == '#') continue; + + // Scan source id. + start = offset; + while (offset < lineLength && (array[offset] < 0 || array[offset] > ' ')) offset++; + + int s; + + if (function == null) { + final long sl; + try { + sl = getLong(array, start, offset - start); + } catch (final RuntimeException e) { + // Discard up to the end of line + LOGGER.error("Error at line " + line + ": " + e.getMessage()); + continue; + } + + s = map.get(sl); + if (s == -1) map.put(sl, s = (int) map.size()); + + if (DEBUG) System.err.println("Parsed source at line " + line + ": " + sl + " => " + s); + } else { + final String ss = new String(array, start, offset - start, charset); + final long sl = function.getLong(ss); + if (sl == -1) { + LOGGER.warn("Unknown source identifier " + ss + " at line " + line); + continue; + } + if (sl < 0 || sl >= n) + throw new IllegalArgumentException("Source node number out of range for node " + ss + ": " + sl); + s = (int) sl; + if (DEBUG) System.err.println("Parsed target at line " + line + ": " + ss + " => " + s); + } + + + // Skip whitespace between identifiers. + while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ') offset++; + + if (offset == lineLength) { + LOGGER.error("Error at line " + line + ": no target"); + continue; + } + + // Scan target id. 
+ start = offset; + while (offset < lineLength && (array[offset] < 0 || array[offset] > ' ')) offset++; + + int t; + + if (function == null) { + final long tl; + try { + tl = getLong(array, start, offset - start); + } catch (final RuntimeException e) { + // Discard up to the end of line + LOGGER.error("Error at line " + line + ": " + e.getMessage()); + continue; + } + + t = map.get(tl); + if (t == -1) map.put(tl, t = (int) map.size()); + + if (DEBUG) System.err.println("Parsed target at line " + line + ": " + tl + " => " + t); + } else { + final String ts = new String(array, start, offset - start, charset); + final long tl = function.getLong(ts); + if (tl == -1) { + LOGGER.warn("Unknown target identifier " + ts + " at line " + line); + continue; + } + + if (tl < 0 || tl >= n) + throw new IllegalArgumentException("Target node number out of range for node " + ts + ": " + tl); + t = (int) tl; + if (DEBUG) System.err.println("Parsed target at line " + line + ": " + ts + " => " + t); + } + + // Skip whitespace between identifiers. + while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ') offset++; + + if (offset == lineLength) { + LOGGER.error("Error at line " + line + ": no target"); + continue; + } + + // Scan label. + start = offset; + while (offset < lineLength && (array[offset] < 0 || array[offset] > ' ')) offset++; + + final String ls = new String(array, start, offset - start, charset); + + // Insert current value into the prototype label. + labelMapping.apply(prototype, ls); + if (DEBUG) System.err.println("Parsed label at line " + line + ": " + ls + " => " + prototype.get()); + + + // Skip whitespace after label. 
+ while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ') offset++; + + if (offset < lineLength) LOGGER.warn("Trailing characters ignored at line " + line); + + if (DEBUG) System.err.println("Parsed labelled arc at line " + line + ": " + s + " -> " + t + " (" + prototype.get() + ")"); + + if (s != t || !noLoops) { + source[j] = s; + target[j] = t; + labelStart[j] = obs.writtenBits(); + prototype.toBitStream(obs, s); + j++; + + if (symmetrize && s != t) { + source[j] = t; + target[j] = s; + labelStart[j] = obs.writtenBits(); + prototype.toBitStream(obs, t); + j++; + } + + if (j == batchSize) { + obs.flush(); + pairs += processTransposeBatch(batchSize, source, target, labelStart, new InputBitStream(fbos.array), tempDir, batches, labelBatches, prototype); + fbos = new FastByteArrayOutputStream(); + obs = new OutputBitStream(fbos); + j = 0; + } + + if (pl != null) pl.lightUpdate(); + } + } + + if (j != 0) { + obs.flush(); + pairs += processTransposeBatch(batchSize, source, target, labelStart, new InputBitStream(fbos.array), tempDir, batches, labelBatches, prototype); + } + + if (pl != null) { + pl.done(); + logBatches(batches, pairs, pl); + } + + numNodes = function == null ? (int) map.size() : function.size(); + source = null; + target = null; + + map.compact(); + + // Non capisco esattamente come mai salvare le chiavi e i valori della mappa per poi ricaricarli? + // Riguarda il memory management? Chiedere! + // Per ora lascio così com'è, ma se è da fare farei un terzo file e usere BinIO.storeObject per salvare le label. 
+ + final File keyFile = File.createTempFile(ScatteredLabelledArcsASCIIGraph.class.getSimpleName(), "keys", tempDir); + keyFile.deleteOnExit(); + final File valueFile = File.createTempFile(ScatteredLabelledArcsASCIIGraph.class.getSimpleName(), "values", tempDir); + valueFile.deleteOnExit(); + + BinIO.storeLongs(map.key, 0, map.size(), keyFile); + BinIO.storeInts(map.value, 0, map.size(), valueFile); + + map = null; + + long[][] key = BinIO.loadLongsBig(keyFile); + keyFile.delete(); + int[][] value = BinIO.loadIntsBig(valueFile); + valueFile.delete(); + + if (function == null) { + this.ids = new long[numNodes]; + + final long[] result = new long[numNodes]; + for (int i = numNodes; i-- != 0; ) result[BigArrays.get(value, i)] = BigArrays.get(key, i); + this.ids = result; + } + + key = null; + value = null; + + this.arcLabelledBatchGraph = new Transform.ArcLabelledBatchGraph(function == null ? numNodes : n, pairs, batches, labelBatches, prototype); + } + + protected static void logBatches(final ObjectArrayList batches, final long pairs, final ProgressLogger pl) { + long length = 0; + for(final File f : batches) length += f.length(); + pl.logger().info("Created " + batches.size() + " batches using " + Util.format((double)Byte.SIZE * length / pairs) + " bits/arc."); + } + + private final static long getLong(final byte[] array, int offset, int length) { + if (length == 0) throw new NumberFormatException("Empty number"); + int sign = 1; + if (array[offset] == '-') { + sign = -1; + offset++; + length--; + } + + long value = 0; + for (int i = 0; i < length; i++) { + final byte digit = array[offset + i]; + if (digit < '0' || digit > '9') throw new NumberFormatException("Not a digit: " + (char) digit); + value *= 10; + value += digit - '0'; + } + + return sign * value; + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param arcs an iterator returning the arcs as two-element arrays. + * @param symmetrize the new graph will be forced to be symmetric. 
+ * @param noLoops the new graph will have no loops. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by this method. + * @param tempDir a temporary directory for the batches, or null for {@link File#createTempFile(java.lang.String, java.lang.String)}'s choice. + * @param pl a progress logger, or null. + */ + public ScatteredLabelledArcsASCIIGraph(final Iterator arcs, final Iterator

The actual table size will be the least power of two greater than - * expected/f. - * - * @param expected the expected number of elements in the set. - * @param f the load factor. - */ - public Long2IntOpenHashBigMap(final long expected, final float f) { - if (f <= 0 || f > 1) - throw new IllegalArgumentException("Load factor must be greater than 0 and smaller than or equal to 1"); - if (this.n < 0) throw new IllegalArgumentException("The expected number of elements must be nonnegative"); - this.f = f; - this.n = bigArraySize(expected, f); - this.maxFill = maxFill(this.n, f); - this.key = LongBigArrays.newBigArray(this.n); - this.value = IntBigArrays.newBigArray(this.n); - this.used = BooleanBigArrays.newBigArray(this.n); - this.initMasks(); - } - - /** - * Creates a new hash big set with initial expected {@link Hash#DEFAULT_INITIAL_SIZE} elements and - * {@link Hash#DEFAULT_LOAD_FACTOR} as load factor. - */ - - public Long2IntOpenHashBigMap() { - this(DEFAULT_INITIAL_SIZE, DEFAULT_LOAD_FACTOR); - } - - public int put(final long k, final int v) { - final long h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); - - // The starting point. - int displ = (int)(h & this.segmentMask); - int base = (int)((h & this.mask) >>> BigArrays.SEGMENT_SHIFT); - - // There's always an unused entry. - while (this.used[base][displ]) { - if (k == this.key[base][displ]) { - final int oldValue = this.value[base][displ]; - this.value[base][displ] = v; - return oldValue; - } - base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; - } - - this.used[base][displ] = true; - this.key[base][displ] = k; - this.value[base][displ] = v; - - if (++this.size >= this.maxFill) this.rehash(2 * this.n); - return -1; - } - - public int get(final long k) { - final long h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); - - // The starting point. 
- int displ = (int)(h & this.segmentMask); - int base = (int)((h & this.mask) >>> BigArrays.SEGMENT_SHIFT); - - // There's always an unused entry. - while (this.used[base][displ]) { - if (k == this.key[base][displ]) return this.value[base][displ]; - base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; - } - - return -1; - } - - private void rehash(final long newN) { - final boolean[][] used = this.used; - final long[][] key = this.key; - final int[][] value = this.value; - final boolean[][] newUsed = BooleanBigArrays.newBigArray(newN); - final long[][] newKey = LongBigArrays.newBigArray(newN); - final int[][] newValue = IntBigArrays.newBigArray(newN); - final long newMask = newN - 1; - final int newSegmentMask = newKey[0].length - 1; - final int newBaseMask = newKey.length - 1; - - int base = 0, displ = 0; - long h; - long k; - - for (long i = this.size; i-- != 0; ) { - - while (!used[base][displ]) base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)); - - k = key[base][displ]; - h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); - - // The starting point. - int d = (int)(h & newSegmentMask); - int b = (int)((h & newMask) >>> BigArrays.SEGMENT_SHIFT); - - while (newUsed[b][d]) b = (b + ((d = (d + 1) & newSegmentMask) == 0 ? 1 : 0)) & newBaseMask; - - newUsed[b][d] = true; - newKey[b][d] = k; - newValue[b][d] = value[base][displ]; - - base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)); - } - - this.n = newN; - this.key = newKey; - this.value = newValue; - this.used = newUsed; - this.initMasks(); - this.maxFill = maxFill(this.n, this.f); - } - - public void compact() { - int base = 0, displ = 0, b = 0, d = 0; - for (long i = this.size; i-- != 0; ) { - while (!this.used[base][displ]) - base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 
1 : 0)) & this.baseMask; - this.key[b][d] = this.key[base][displ]; - this.value[b][d] = this.value[base][displ]; - base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; - b = (b + ((d = (d + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; - } - } - - public long size() { - return this.size; - } - } - /** * Creates a scattered-arcs ASCII graph. * @@ -430,13 +230,6 @@ public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFu this(is, function, labelPrototype, labelMapping, charset, n, symmetrize, noLoops, batchSize, tempDir, null); } - // TODO: Move somewhere else - // Given a label prototype and a value set the value inside the label without creating a new one - // it's like a setter, but for labels. - public interface LabelMapping { - void apply(Label prototype, String representation); - } - /** * Creates a scattered-arcs ASCII graph. * @@ -685,32 +478,6 @@ public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFu this.arcLabelledBatchGraph = new Transform.ArcLabelledBatchGraph(function == null ? 
numNodes : n, pairs, batches, labelBatches, prototype); } - protected static void logBatches(final ObjectArrayList batches, final long pairs, final ProgressLogger pl) { - long length = 0; - for (final File f : batches) length += f.length(); - pl.logger().info("Created " + batches.size() + " batches using " + Util.format((double)Byte.SIZE * length / pairs) + " bits/arc."); - } - - private final static long getLong(final byte[] array, int offset, int length) { - if (length == 0) throw new NumberFormatException("Empty number"); - int sign = 1; - if (array[offset] == '-') { - sign = -1; - offset++; - length--; - } - - long value = 0; - for (int i = 0; i < length; i++) { - final byte digit = array[offset + i]; - if (digit < '0' || digit > '9') throw new NumberFormatException("Not a digit: " + (char)digit); - value *= 10; - value += digit - '0'; - } - - return sign * value; - } - /** * Creates a scattered-arcs ASCII graph. * @@ -832,6 +599,32 @@ public ScatteredLabelledArcsASCIIGraph(final Iterator arcs, final Iterat this.arcLabelledBatchGraph = new Transform.ArcLabelledBatchGraph(numNodes, pairs, batches, labelBatches, prototype); } + protected static void logBatches(final ObjectArrayList batches, final long pairs, final ProgressLogger pl) { + long length = 0; + for (final File f : batches) length += f.length(); + pl.logger().info("Created " + batches.size() + " batches using " + Util.format((double)Byte.SIZE * length / pairs) + " bits/arc."); + } + + private final static long getLong(final byte[] array, int offset, int length) { + if (length == 0) throw new NumberFormatException("Empty number"); + int sign = 1; + if (array[offset] == '-') { + sign = -1; + offset++; + length--; + } + + long value = 0; + for (int i = 0; i < length; i++) { + final byte digit = array[offset + i]; + if (digit < '0' || digit > '9') throw new NumberFormatException("Not a digit: " + (char)digit); + value *= 10; + value += digit - '0'; + } + + return sign * value; + } + @Override public int 
numNodes() { if (this.arcLabelledBatchGraph == null) @@ -866,6 +659,225 @@ public ScatteredLabelledArcsASCIIGraph copy() { return this; } + @Override + public String toString() { + final MutableString ms = new MutableString(); + ArcLabelledNodeIterator nodeIterator = nodeIterator(); + ms.append("Nodes: " + numNodes() + "\nArcs: " + numArcs() + "\n"); + while (nodeIterator.hasNext()) { + int node = nodeIterator.nextInt(); + Label[] labels = nodeIterator.labelArray(); + ms.append("Successors of " + node + " (degree " + nodeIterator.outdegree() + "):"); + for (int k = 0; k < nodeIterator.outdegree(); k++) { + ms.append(" " + node + " (" + labels[k].get() + ")"); + } + ms.append("\n"); + + } + return ms.toString(); + } + + // TODO: Move somewhere else + // Given a label prototype and a value set the value inside the label without creating a new one + // it's like a setter, but for labels. + public interface LabelMapping { + void apply(Label prototype, String representation); + } + + private static final class Long2IntOpenHashBigMap implements java.io.Serializable, Cloneable, Hash { + public static final long serialVersionUID = 0L; + /** + * The acceptable load factor. + */ + private final float f; + /** + * The big array of keys. + */ + public transient long[][] key; + /** + * The big array of values. + */ + public transient int[][] value; + /** + * The big array telling whether a position is used. + */ + private transient boolean[][] used; + /** + * The current table size (always a power of 2). + */ + private transient long n; + + /** + * Threshold after which we rehash. It must be the table size times {@link #f}. + */ + private transient long maxFill; + + /** + * The mask for wrapping a position counter. + */ + private transient long mask; + + /** + * The mask for wrapping a segment counter. + */ + private transient int segmentMask; + + /** + * The mask for wrapping a base counter. + */ + private transient int baseMask; + + /** + * Number of entries in the set. 
+ */ + private long size; + + /** + * Creates a new hash big set. + * + *

The actual table size will be the least power of two greater than + * expected/f. + * + * @param expected the expected number of elements in the set. + * @param f the load factor. + */ + public Long2IntOpenHashBigMap(final long expected, final float f) { + if (f <= 0 || f > 1) + throw new IllegalArgumentException("Load factor must be greater than 0 and smaller than or equal to 1"); + if (this.n < 0) throw new IllegalArgumentException("The expected number of elements must be nonnegative"); + this.f = f; + this.n = bigArraySize(expected, f); + this.maxFill = maxFill(this.n, f); + this.key = LongBigArrays.newBigArray(this.n); + this.value = IntBigArrays.newBigArray(this.n); + this.used = BooleanBigArrays.newBigArray(this.n); + this.initMasks(); + } + + /** + * Creates a new hash big set with initial expected {@link Hash#DEFAULT_INITIAL_SIZE} elements and + * {@link Hash#DEFAULT_LOAD_FACTOR} as load factor. + */ + + public Long2IntOpenHashBigMap() { + this(DEFAULT_INITIAL_SIZE, DEFAULT_LOAD_FACTOR); + } + + /** + * Initialises the mask values. + */ + private void initMasks() { + this.mask = this.n - 1; + /* + * Note that either we have more than one segment, and in this case all segments are + * BigArrays.SEGMENT_SIZE long, or we have exactly one segment whose length is a power of + * two. + */ + this.segmentMask = this.key[0].length - 1; + this.baseMask = this.key.length - 1; + } + + public int put(final long k, final int v) { + final long h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); + + // The starting point. + int displ = (int)(h & this.segmentMask); + int base = (int)((h & this.mask) >>> BigArrays.SEGMENT_SHIFT); + + // There's always an unused entry. + while (this.used[base][displ]) { + if (k == this.key[base][displ]) { + final int oldValue = this.value[base][displ]; + this.value[base][displ] = v; + return oldValue; + } + base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 
1 : 0)) & this.baseMask; + } + + this.used[base][displ] = true; + this.key[base][displ] = k; + this.value[base][displ] = v; + + if (++this.size >= this.maxFill) this.rehash(2 * this.n); + return -1; + } + + public int get(final long k) { + final long h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); + + // The starting point. + int displ = (int)(h & this.segmentMask); + int base = (int)((h & this.mask) >>> BigArrays.SEGMENT_SHIFT); + + // There's always an unused entry. + while (this.used[base][displ]) { + if (k == this.key[base][displ]) return this.value[base][displ]; + base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; + } + + return -1; + } + + private void rehash(final long newN) { + final boolean[][] used = this.used; + final long[][] key = this.key; + final int[][] value = this.value; + final boolean[][] newUsed = BooleanBigArrays.newBigArray(newN); + final long[][] newKey = LongBigArrays.newBigArray(newN); + final int[][] newValue = IntBigArrays.newBigArray(newN); + final long newMask = newN - 1; + final int newSegmentMask = newKey[0].length - 1; + final int newBaseMask = newKey.length - 1; + + int base = 0, displ = 0; + long h; + long k; + + for (long i = this.size; i-- != 0; ) { + + while (!used[base][displ]) base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)); + + k = key[base][displ]; + h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); + + // The starting point. + int d = (int)(h & newSegmentMask); + int b = (int)((h & newMask) >>> BigArrays.SEGMENT_SHIFT); + + while (newUsed[b][d]) b = (b + ((d = (d + 1) & newSegmentMask) == 0 ? 1 : 0)) & newBaseMask; + + newUsed[b][d] = true; + newKey[b][d] = k; + newValue[b][d] = value[base][displ]; + + base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 
1 : 0)); + } + + this.n = newN; + this.key = newKey; + this.value = newValue; + this.used = newUsed; + this.initMasks(); + this.maxFill = maxFill(this.n, this.f); + } + + public void compact() { + int base = 0, displ = 0, b = 0, d = 0; + for (long i = this.size; i-- != 0; ) { + while (!this.used[base][displ]) + base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; + this.key[b][d] = this.key[base][displ]; + this.value[b][d] = this.value[base][displ]; + base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; + b = (b + ((d = (d + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; + } + } + + public long size() { + return this.size; + } + } + /* @SuppressWarnings("unchecked") public static void main(final String[] args) throws IllegalArgumentException, SecurityException, IOException, JSAPException, ClassNotFoundException { String basename; From e0ae0c7f28ee739d3bc6b7ef782fa507918bbec3 Mon Sep 17 00:00:00 2001 From: Luigi Foscari Date: Mon, 20 Feb 2023 15:53:54 +0100 Subject: [PATCH 09/85] fixed toString error --- .../webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java b/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java index a0353cd..72de9d2 100644 --- a/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java +++ b/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java @@ -666,10 +666,11 @@ public String toString() { ms.append("Nodes: " + numNodes() + "\nArcs: " + numArcs() + "\n"); while (nodeIterator.hasNext()) { int node = nodeIterator.nextInt(); + ArcLabelledNodeIterator.LabelledArcIterator successors = nodeIterator.successors(); Label[] labels = nodeIterator.labelArray(); ms.append("Successors of " + node + " (degree " + nodeIterator.outdegree() + "):"); for (int k = 0; k < 
nodeIterator.outdegree(); k++) { - ms.append(" " + node + " (" + labels[k].get() + ")"); + ms.append(" " + successors.nextInt() + " (" + labels[k].get() + ")"); } ms.append("\n"); From 10c115252086ee9b812eeb61c25ad143e571eae0 Mon Sep 17 00:00:00 2001 From: Luigi Foscari Date: Mon, 20 Feb 2023 16:41:44 +0100 Subject: [PATCH 10/85] changed processTransposeBatch to prune duplicate arcs --- src/it/unimi/dsi/webgraph/Transform.java | 64 +++++++++++++++++------- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/src/it/unimi/dsi/webgraph/Transform.java b/src/it/unimi/dsi/webgraph/Transform.java index ff3c9df..4881a8e 100644 --- a/src/it/unimi/dsi/webgraph/Transform.java +++ b/src/it/unimi/dsi/webgraph/Transform.java @@ -1199,7 +1199,7 @@ public int[] successorArray() { final int numPairs = this.numPairs; // Neither quicksort nor heaps are stable, so we reestablish order here. IntArrays.quickSort(successor, 0, numPairs); - if (numPairs!= 0) { + if (numPairs != 0) { int p = 0; for (int j = 1; j < numPairs; j++) if (successor[p] != successor[j]) successor[++p] = successor[j]; outdegree = p + 1; @@ -1311,6 +1311,8 @@ class InternalArcLabelledNodeIterator extends ArcLabelledNodeIterator { private int last; /** The outdegree of the current node (valid if {@link #last} is not -1). */ private int outdegree; + /** The number of pairs associated with the current node (valid if {@link #last} is not -1). */ + private int numPairs; /** The successors of the current node (valid if {@link #last} is not -1); * only the first {@link #outdegree} entries are meaningful. 
*/ private int[] successor; @@ -1319,7 +1321,7 @@ class InternalArcLabelledNodeIterator extends ArcLabelledNodeIterator { private Label[] label; public InternalArcLabelledNodeIterator(final int upperBound) throws IOException { - this(upperBound, null, null, null, null, null, -1, 0, IntArrays.EMPTY_ARRAY, Label.EMPTY_LABEL_ARRAY); + this(upperBound, null, null, null, null, null, -1, -1, IntArrays.EMPTY_ARRAY, Label.EMPTY_LABEL_ARRAY); } public InternalArcLabelledNodeIterator(final int upperBound, final InputBitStream[] baseIbs, final InputBitStream[] baseLabelInputBitStream, final int[] refArray, final int[] prevTarget, final int[] inputStreamLength, final int last, final int outdegree, final int successor[], final Label[] label) throws IOException { @@ -1377,8 +1379,10 @@ public boolean hasNext() { @Override public int nextInt() { + if (! hasNext()) throw new NoSuchElementException(); last++; int d = 0; + outdegree = -1; int i; try { @@ -1395,8 +1399,8 @@ public int nextInt() { if (--inputStreamLength[i] == 0) { queue.dequeue(); batchIbs[i].close(); - labelInputBitStream[i].close(); batchIbs[i] = null; + labelInputBitStream[i].close(); labelInputBitStream[i] = null; } else { @@ -1410,8 +1414,19 @@ public int nextInt() { } d++; } + + numPairs = d; + } + catch(final IOException e) { + e.printStackTrace(); + throw new RuntimeException(this + " " + e); + } + + // Compute outdegree + if (outdegree == -1) { + final int numPairs = this.numPairs; // Neither quicksort nor heaps are stable, so we reestablish order here. 
- it.unimi.dsi.fastutil.Arrays.quickSort(0, d, (x, y) -> Integer.compare(successor[x], successor[y]), + it.unimi.dsi.fastutil.Arrays.quickSort(0, numPairs, (x, y) -> Integer.compare(successor[x], successor[y]), (x, y) -> { final int t = successor[x]; successor[x] = successor[y]; @@ -1420,12 +1435,16 @@ public int nextInt() { label[x] = label[y]; label[y] = l; }); - } - catch(final IOException e) { - throw new RuntimeException(e); + + if (numPairs != 0) { + // Avoid returning the duplicate arcs + int p = 0; + for (int j = 1; j < numPairs; j++) if (successor[p] != successor[j]) successor[++p] = successor[j]; + outdegree = p + 1; + } + else outdegree = 0; } - outdegree = d; return last; } @@ -1604,6 +1623,12 @@ public static int processTransposeBatch(final int n, final int[] source, final i batchFile.deleteOnExit(); batches.add(batchFile); final OutputBitStream batch = new OutputBitStream(batchFile); + + final File labelFile = File.createTempFile("label-", ".bits", tempDir); + labelFile.deleteOnExit(); + labelBatches.add(labelFile); + final OutputBitStream labelObs = new OutputBitStream(labelFile); + int u = 0; if (n != 0) { @@ -1616,32 +1641,35 @@ public static int processTransposeBatch(final int n, final int[] source, final i batch.writeDelta(prevSource); batch.writeDelta(target[0]); + labelBitStream.position(start[0]); + prototype.fromBitStream(labelBitStream, source[0]); + prototype.toBitStream(labelObs, target[0]); + for(int i = 1; i < n; i++) { if (source[i] != prevSource) { batch.writeDelta(source[i] - prevSource); batch.writeDelta(target[i]); prevSource = source[i]; + + labelBitStream.position(start[i]); + prototype.fromBitStream(labelBitStream, source[i]); + prototype.toBitStream(labelObs, target[i]); } else if (target[i] != target[i - 1]) { // We don't write duplicate pairs batch.writeDelta(0); batch.writeDelta(target[i] - target[i - 1] - 1); + + labelBitStream.position(start[i]); + prototype.fromBitStream(labelBitStream, source[i]); + 
prototype.toBitStream(labelObs, target[i]); } } } + else batch.writeDelta(0); batch.close(); - - final File labelFile = File.createTempFile("label-", ".bits", tempDir); - labelFile.deleteOnExit(); - labelBatches.add(labelFile); - final OutputBitStream labelObs = new OutputBitStream(labelFile); - for (int i = 0; i < n; i++) { - labelBitStream.position(start[i]); - prototype.fromBitStream(labelBitStream, source[i]); - prototype.toBitStream(labelObs, target[i]); - } labelObs.close(); return u; From d654603baae28f02dcf39b41b261ef2cffb5447f Mon Sep 17 00:00:00 2001 From: Luigi Foscari Date: Mon, 20 Feb 2023 16:43:00 +0100 Subject: [PATCH 11/85] minor refactoring and formatting --- .../ScatteredLabelledArcsASCIIGraphTest.java | 150 ++++++++---------- 1 file changed, 68 insertions(+), 82 deletions(-) diff --git a/test/it/unimi/dsi/webgraph/ScatteredLabelledArcsASCIIGraphTest.java b/test/it/unimi/dsi/webgraph/ScatteredLabelledArcsASCIIGraphTest.java index edb2d17..7b892b3 100644 --- a/test/it/unimi/dsi/webgraph/ScatteredLabelledArcsASCIIGraphTest.java +++ b/test/it/unimi/dsi/webgraph/ScatteredLabelledArcsASCIIGraphTest.java @@ -26,13 +26,12 @@ import org.junit.Test; import java.io.IOException; -import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.List; -import static it.unimi.dsi.webgraph.labelling.ScatteredLabelledArcsASCIIGraph.*; +import static it.unimi.dsi.webgraph.labelling.ScatteredLabelledArcsASCIIGraph.LabelMapping; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; @@ -42,91 +41,97 @@ public class ScatteredLabelledArcsASCIIGraphTest extends WebGraphTestCase { // TODO: label tests + private static Iterator toArcsIterator(final String s) { + final String[] arcs = s.split("\n"); + final List arcSet = new ArrayList<>(); + for (final String arc : arcs) { + final String[] parts = arc.split(" "); + arcSet.add(new long[] 
{Long.parseLong(parts[0]), Long.parseLong(parts[1])}); + } + return arcSet.iterator(); + } + + private static Iterator

The actual table size will be the least power of two greater than + *

+ * The actual table size will be the least power of two greater than * expected/f. * - * @param expected the expected number of elements in the set. + * @param expected the expected number of elements in the map. * @param f the load factor. */ public Long2IntOpenHashBigMap(final long expected, final float f) { @@ -306,7 +307,17 @@ protected void rehash(final long newN) { maxFill = maxFill(n, f); } - public void compact() { + /** + * Assuming that the map is a minimal perfect hash, returns the list of keys in value order. + * + *

+ * The map is not usable after this call. + * + * @param tempDir a temporary directory for storing keys and values. + * @return the list of keys in value order. + */ + public long[] getIds(final File tempDir) throws IOException { + // Here we assume that the map is a minimal perfect hash int base = 0, displ = 0, b = 0, d = 0; for(long i = size; i-- != 0;) { while (! used[base][displ]) base = (base + ((displ = (displ + 1) & segmentMask) == 0 ? 1 : 0)) & baseMask; @@ -315,6 +326,28 @@ public void compact() { base = (base + ((displ = (displ + 1) & segmentMask) == 0 ? 1 : 0)) & baseMask; b = (b + ((d = (d + 1) & segmentMask) == 0 ? 1 : 0)) & baseMask; } + + // The following weird code minimizes memory usage + final File keyFile = File.createTempFile(ScatteredArcsASCIIGraph.class.getSimpleName(), "keys", tempDir); + keyFile.deleteOnExit(); + final File valueFile = File.createTempFile(ScatteredArcsASCIIGraph.class.getSimpleName(), "values", tempDir); + valueFile.deleteOnExit(); + + BinIO.storeLongs(key, 0, size(), keyFile); + BinIO.storeInts(value, 0, size(), valueFile); + + used = null; + key = null; + value = null; + + final long[][] key = BinIO.loadLongsBig(keyFile); + keyFile.delete(); + final int[][] value = BinIO.loadIntsBig(valueFile); + valueFile.delete(); + + final long[] result = new long[(int)size]; + for (int i = (int)size(); i-- != 0;) result[BigArrays.get(value, i)] = BigArrays.get(key, i); + return result; } public long size() { @@ -465,7 +498,7 @@ public ScatteredArcsASCIIGraph(final InputStream is, final Object2LongFunction function, Charset charset, final int n, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { @SuppressWarnings("resource") final FastBufferedInputStream fbis = new FastBufferedInputStream(is); - Long2IntOpenHashBigMap map = new Long2IntOpenHashBigMap(); + final Long2IntOpenHashBigMap map = new Long2IntOpenHashBigMap(); int numNodes = -1; if (charset == 
null) charset = Charset.forName("ISO-8859-1"); @@ -624,34 +657,7 @@ public ScatteredArcsASCIIGraph(final InputStream is, final Object2LongFunctionnull. */ public ScatteredArcsASCIIGraph(final Iterator arcs, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { - Long2IntOpenHashBigMap map = new Long2IntOpenHashBigMap(); + final Long2IntOpenHashBigMap map = new Long2IntOpenHashBigMap(); int numNodes = -1; @@ -742,32 +748,7 @@ public ScatteredArcsASCIIGraph(final Iterator arcs, final boolean symmet source = null; target = null; - map.compact(); - - final File keyFile = File.createTempFile(ScatteredArcsASCIIGraph.class.getSimpleName(), "keys", tempDir); - keyFile.deleteOnExit(); - final File valueFile = File.createTempFile(ScatteredArcsASCIIGraph.class.getSimpleName(), "values", tempDir); - valueFile.deleteOnExit(); - - BinIO.storeLongs(map.key, 0, map.size(), keyFile); - BinIO.storeInts(map.value, 0, map.size(), valueFile); - - map = null; - - long[][] key = BinIO.loadLongsBig(keyFile); - keyFile.delete(); - int[][] value = BinIO.loadIntsBig(valueFile); - valueFile.delete(); - - ids = new long[numNodes]; - - final long[] result = new long[numNodes]; - for(int i = numNodes; i--!= 0;) result[BigArrays.get(value, i)] = BigArrays.get(key, i); - ids = result; - - key = null; - value = null; - + ids = map.getIds(tempDir); batchGraph = new Transform.BatchGraph(numNodes, pairs, batches); } From d4ab00317bda26ef44eb73c00203af7eef707c62 Mon Sep 17 00:00:00 2001 From: Sebastiano Vigna Date: Wed, 22 Feb 2023 00:10:52 +0000 Subject: [PATCH 24/85] Reimplemented the id big map in the new fastutil style --- .../dsi/webgraph/ScatteredArcsASCIIGraph.java | 161 ++++++++---------- src/it/unimi/dsi/webgraph/Transform.java | 2 +- .../webgraph/ScatteredArcsASCIIGraphTest.java | 17 ++ 3 files changed, 88 insertions(+), 92 deletions(-) diff --git a/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java 
b/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java index 4fce3a7..961eb90 100644 --- a/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java +++ b/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java @@ -42,7 +42,6 @@ import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.Hash; -import it.unimi.dsi.fastutil.booleans.BooleanBigArrays; import it.unimi.dsi.fastutil.bytes.ByteArrays; import it.unimi.dsi.fastutil.ints.IntBigArrays; import it.unimi.dsi.fastutil.io.BinIO; @@ -145,53 +144,45 @@ public class ScatteredArcsASCIIGraph extends ImmutableSequentialGraph { /** The list of identifiers in order of appearance. */ public long[] ids; - private static final class Long2IntOpenHashBigMap implements java.io.Serializable, Cloneable, Hash { - public static final long serialVersionUID = 0L; - + public static class ID2NodeMap implements Hash { /** The big array of keys. */ - public transient long[][] key; + protected long[][] key; /** The big array of values. */ - public transient int[][] value; + protected int[][] value; - /** The big array telling whether a position is used. */ - protected transient boolean[][] used; + /** Whether the zero key is present (the value is stored in position {@link #n). */ + protected boolean containsZeroKey; /** The acceptable load factor. */ protected final float f; /** The current table size (always a power of 2). */ - protected transient long n; + protected long n; /** Threshold after which we rehash. It must be the table size times {@link #f}. */ - protected transient long maxFill; + protected long maxFill; /** The mask for wrapping a position counter. */ - protected transient long mask; + protected long mask; /** The mask for wrapping a segment counter. */ - protected transient int segmentMask; + protected int segmentMask; /** The mask for wrapping a base counter. */ - protected transient int baseMask; + protected int baseMask; /** Number of entries in the set. 
*/ - protected long size; + protected int size; - /** Initialises the mask values. */ private void initMasks() { mask = n - 1; - /* - * Note that either we have more than one segment, and in this case all segments are - * BigArrays.SEGMENT_SIZE long, or we have exactly one segment whose length is a power of - * two. - */ - segmentMask = key[0].length - 1; baseMask = key.length - 1; + segmentMask = baseMask == 0 ? (int)(n - 1) : BigArrays.SEGMENT_SIZE - 1; } /** - * Creates a new hash big map. + * Creates a new map based on a hash table. * *

* The actual table size will be the least power of two greater than @@ -200,15 +191,14 @@ private void initMasks() { * @param expected the expected number of elements in the map. * @param f the load factor. */ - public Long2IntOpenHashBigMap(final long expected, final float f) { + public ID2NodeMap(final long expected, final float f) { if (f <= 0 || f > 1) throw new IllegalArgumentException("Load factor must be greater than 0 and smaller than or equal to 1"); if (n < 0) throw new IllegalArgumentException("The expected number of elements must be nonnegative"); this.f = f; n = bigArraySize(expected, f); maxFill = maxFill(n, f); - key = LongBigArrays.newBigArray(n); - value = IntBigArrays.newBigArray(n); - used = BooleanBigArrays.newBigArray(n); + key = LongBigArrays.newBigArray(n + 1); + value = IntBigArrays.newBigArray(n + 1); initMasks(); } @@ -217,116 +207,110 @@ public Long2IntOpenHashBigMap(final long expected, final float f) { * and {@link Hash#DEFAULT_LOAD_FACTOR} as load factor. */ - public Long2IntOpenHashBigMap() { + public ID2NodeMap() { this(DEFAULT_INITIAL_SIZE, DEFAULT_LOAD_FACTOR); } - public int put(final long k, final int v) { - final long h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); - - // The starting point. - int displ = (int)(h & segmentMask); - int base = (int)((h & mask) >>> BigArrays.SEGMENT_SHIFT); - - // There's always an unused entry. - while (used[base][displ]) { - if (k == key[base][displ]) { - final int oldValue = value[base][displ]; - value[base][displ] = v; - return oldValue; - } - base = (base + ((displ = (displ + 1) & segmentMask) == 0 ? 1 : 0)) & baseMask; - } - - used[base][displ] = true; - key[base][displ] = k; - value[base][displ] = v; + /** + * Returns the node associated with a given identifier, assigning a new one if necessary. + * + * @param id an identifier. + * @return the associated node. 
+ */ + public int getNode(final long id) { + if (id == 0) { + if (containsZeroKey) return BigArrays.get(value, n); + BigArrays.set(value, n, size); + containsZeroKey = true; + } else { - if (++size >= maxFill) rehash(2 * n); - return -1; - } + final long h = it.unimi.dsi.fastutil.HashCommon.mix(id); - public int get(final long k) { - final long h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); + // The starting point. + int displ = (int)(h & segmentMask); + int base = (int)((h & mask) >>> BigArrays.SEGMENT_SHIFT); - // The starting point. - int displ = (int)(h & segmentMask); - int base = (int)((h & mask) >>> BigArrays.SEGMENT_SHIFT); + // There's always an unused entry. + while (key[base][displ] != 0) { + if (id == key[base][displ]) return value[base][displ]; + base = (base + ((displ = (displ + 1) & segmentMask) == 0 ? 1 : 0)) & baseMask; + } - // There's always an unused entry. - while (used[base][displ]) { - if (k == key[base][displ]) return value[base][displ]; - base = (base + ((displ = (displ + 1) & segmentMask) == 0 ? 1 : 0)) & baseMask; + key[base][displ] = id; + value[base][displ] = size; } - return -1; + if (++size >= maxFill) rehash(2 * n); + return size - 1; } protected void rehash(final long newN) { - final boolean used[][] = this.used; final long key[][] = this.key; final int[][] value = this.value; - final boolean newUsed[][] = BooleanBigArrays.newBigArray(newN); - final long newKey[][] = LongBigArrays.newBigArray(newN); - final int newValue[][] = IntBigArrays.newBigArray(newN); + final long newKey[][] = LongBigArrays.newBigArray(newN + 1); + final int newValue[][] = IntBigArrays.newBigArray(newN + 1); final long newMask = newN - 1; - final int newSegmentMask = newKey[0].length - 1; final int newBaseMask = newKey.length - 1; + final int newSegmentMask = newBaseMask == 0 ? (int)(newN - 1) : BigArrays.SEGMENT_SIZE - 1; + final int realSize = containsZeroKey ? 
size - 1 : size; int base = 0, displ = 0; - long h; - long k; - - for (long i = size; i-- != 0;) { - while (!used[base][displ]) + for (int i = realSize; i-- != 0;) { + while (key[base][displ] == 0) base = (base + ((displ = (displ + 1) & segmentMask) == 0 ? 1 : 0)); - k = key[base][displ]; - h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); + final long k = key[base][displ]; + final long h = it.unimi.dsi.fastutil.HashCommon.mix(k); // The starting point. int d = (int)(h & newSegmentMask); int b = (int)((h & newMask) >>> BigArrays.SEGMENT_SHIFT); - while (newUsed[b][d]) + while (newKey[b][d] != 0) b = (b + ((d = (d + 1) & newSegmentMask) == 0 ? 1 : 0)) & newBaseMask; - newUsed[b][d] = true; newKey[b][d] = k; newValue[b][d] = value[base][displ]; base = (base + ((displ = (displ + 1) & segmentMask) == 0 ? 1 : 0)); } + BigArrays.set(newValue, newN, BigArrays.get(value, n)); + this.n = newN; this.key = newKey; this.value = newValue; - this.used = newUsed; initMasks(); maxFill = maxFill(n, f); } /** - * Assuming that the map is a minimal perfect hash, returns the list of keys in value order. + * Returns the id list in order of appearance as an array. * *

* The map is not usable after this call. * * @param tempDir a temporary directory for storing keys and values. - * @return the list of keys in value order. + * @return the id list in order of appearance. */ public long[] getIds(final File tempDir) throws IOException { // Here we assume that the map is a minimal perfect hash + final int realSize = containsZeroKey ? size - 1 : size; int base = 0, displ = 0, b = 0, d = 0; - for(long i = size; i-- != 0;) { - while (! used[base][displ]) base = (base + ((displ = (displ + 1) & segmentMask) == 0 ? 1 : 0)) & baseMask; + for (int i = realSize; i-- != 0;) { + while (key[base][displ] == 0) base = (base + ((displ = (displ + 1) & segmentMask) == 0 ? 1 : 0)) & baseMask; key[b][d] = key[base][displ]; value[b][d] = value[base][displ]; base = (base + ((displ = (displ + 1) & segmentMask) == 0 ? 1 : 0)) & baseMask; b = (b + ((d = (d + 1) & segmentMask) == 0 ? 1 : 0)) & baseMask; } + if (containsZeroKey) { + key[b][d] = 0; + value[b][d] = BigArrays.get(value, n); + } + // The following weird code minimizes memory usage final File keyFile = File.createTempFile(ScatteredArcsASCIIGraph.class.getSimpleName(), "keys", tempDir); keyFile.deleteOnExit(); @@ -336,7 +320,6 @@ public long[] getIds(final File tempDir) throws IOException { BinIO.storeLongs(key, 0, size(), keyFile); BinIO.storeInts(value, 0, size(), valueFile); - used = null; key = null; value = null; @@ -345,7 +328,7 @@ public long[] getIds(final File tempDir) throws IOException { final int[][] value = BinIO.loadIntsBig(valueFile); valueFile.delete(); - final long[] result = new long[(int)size]; + final long[] result = new long[size]; for (int i = (int)size(); i-- != 0;) result[BigArrays.get(value, i)] = BigArrays.get(key, i); return result; } @@ -498,7 +481,7 @@ public ScatteredArcsASCIIGraph(final InputStream is, final Object2LongFunction function, Charset charset, final int n, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final 
ProgressLogger pl) throws IOException { @SuppressWarnings("resource") final FastBufferedInputStream fbis = new FastBufferedInputStream(is); - final Long2IntOpenHashBigMap map = new Long2IntOpenHashBigMap(); + final ID2NodeMap map = new ID2NodeMap(); int numNodes = -1; if (charset == null) charset = Charset.forName("ISO-8859-1"); @@ -556,8 +539,7 @@ public ScatteredArcsASCIIGraph(final InputStream is, final Object2LongFunction " + s); } @@ -599,8 +581,7 @@ public ScatteredArcsASCIIGraph(final InputStream is, final Object2LongFunction " + t); } @@ -691,7 +672,7 @@ private final static long getLong(final byte[] array, int offset, int length) { * @param pl a progress logger, or null. */ public ScatteredArcsASCIIGraph(final Iterator arcs, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { - final Long2IntOpenHashBigMap map = new Long2IntOpenHashBigMap(); + final ID2NodeMap map = new ID2NodeMap(); int numNodes = -1; @@ -709,11 +690,9 @@ public ScatteredArcsASCIIGraph(final Iterator arcs, final boolean symmet while(arcs.hasNext()) { final long[] arc = arcs.next(); final long sl = arc[0]; - int s = map.get(sl); - if (s == -1) map.put(sl, s = (int)map.size()); + final int s = map.getNode(sl); final long tl = arc[1]; - int t = map.get(tl); - if (t == -1) map.put(tl, t = (int)map.size()); + final int t = map.getNode(tl); if (s != t || ! noLoops) { source[j] = s; diff --git a/src/it/unimi/dsi/webgraph/Transform.java b/src/it/unimi/dsi/webgraph/Transform.java index 900c576..45cc8b5 100644 --- a/src/it/unimi/dsi/webgraph/Transform.java +++ b/src/it/unimi/dsi/webgraph/Transform.java @@ -656,7 +656,7 @@ public static ImmutableGraph map(final ImmutableGraph g, final int map[], final if (! 
g.randomAccess()) throw new IllegalArgumentException("Graph mapping requires random access"); final int sourceNumNodes = g.numNodes(); - if (map.length != sourceNumNodes) throw new IllegalArgumentException("The graph to be mapped has " + sourceNumNodes + " whereas the map contains " + map.length + " entries"); + if (map.length != sourceNumNodes) throw new IllegalArgumentException("The graph to be mapped has " + sourceNumNodes + " nodes whereas the map contains " + map.length + " entries"); int max = -1; if (pl != null) { diff --git a/test/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraphTest.java b/test/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraphTest.java index 82c39be..61a368a 100644 --- a/test/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraphTest.java +++ b/test/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraphTest.java @@ -32,6 +32,7 @@ import it.unimi.dsi.fastutil.io.FastByteArrayInputStream; import it.unimi.dsi.fastutil.objects.Object2LongArrayMap; import it.unimi.dsi.fastutil.objects.Object2LongFunction; +import it.unimi.dsi.webgraph.examples.ErdosRenyiGraph; public class ScatteredArcsASCIIGraphTest extends WebGraphTestCase { @@ -207,4 +208,20 @@ public void testConstructorWithArray() throws IOException { } + @Test + public void testLarge() throws IOException { + final ImmutableGraph erdosRenyiGraph = new ErdosRenyiGraph(100000, .0005, 0, true); + final StringBuilder b = new StringBuilder(); + for (final NodeIterator nodeIterator = erdosRenyiGraph.nodeIterator(); nodeIterator.hasNext();) { + final int curr = nodeIterator.nextInt(); + final LazyIntIterator successors = nodeIterator.successors(); + for (int s; (s = successors.nextInt()) != -1;) b.append(-curr).append('\t').append(-s).append('\n'); + } + + final ScatteredArcsASCIIGraph g = new ScatteredArcsASCIIGraph(new FastByteArrayInputStream(b.toString().getBytes("ASCII")), false, false, 10000, null, null); + final int[] perm = new int[g.numNodes()]; + for (int i = 0; i < g.numNodes(); i++) perm[i] = (int)-g.ids[i]; + 
+ assertEquals(erdosRenyiGraph, Transform.map(new ArrayListMutableGraph(g).immutableView(), perm)); + } } From 34e8b8605c964bd50f7c96d0b84b146236975ed4 Mon Sep 17 00:00:00 2001 From: Sebastiano Vigna Date: Wed, 22 Feb 2023 00:29:19 +0000 Subject: [PATCH 25/85] Better name --- src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java b/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java index 961eb90..2847c97 100644 --- a/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java +++ b/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java @@ -144,7 +144,7 @@ public class ScatteredArcsASCIIGraph extends ImmutableSequentialGraph { /** The list of identifiers in order of appearance. */ public long[] ids; - public static class ID2NodeMap implements Hash { + public static class Id2NodeMap implements Hash { /** The big array of keys. */ protected long[][] key; @@ -191,7 +191,7 @@ private void initMasks() { * @param expected the expected number of elements in the map. * @param f the load factor. */ - public ID2NodeMap(final long expected, final float f) { + public Id2NodeMap(final long expected, final float f) { if (f <= 0 || f > 1) throw new IllegalArgumentException("Load factor must be greater than 0 and smaller than or equal to 1"); if (n < 0) throw new IllegalArgumentException("The expected number of elements must be nonnegative"); this.f = f; @@ -207,7 +207,7 @@ public ID2NodeMap(final long expected, final float f) { * and {@link Hash#DEFAULT_LOAD_FACTOR} as load factor. 
*/ - public ID2NodeMap() { + public Id2NodeMap() { this(DEFAULT_INITIAL_SIZE, DEFAULT_LOAD_FACTOR); } @@ -481,7 +481,7 @@ public ScatteredArcsASCIIGraph(final InputStream is, final Object2LongFunction function, Charset charset, final int n, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { @SuppressWarnings("resource") final FastBufferedInputStream fbis = new FastBufferedInputStream(is); - final ID2NodeMap map = new ID2NodeMap(); + final Id2NodeMap map = new Id2NodeMap(); int numNodes = -1; if (charset == null) charset = Charset.forName("ISO-8859-1"); @@ -672,7 +672,7 @@ private final static long getLong(final byte[] array, int offset, int length) { * @param pl a progress logger, or null. */ public ScatteredArcsASCIIGraph(final Iterator arcs, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { - final ID2NodeMap map = new ID2NodeMap(); + final Id2NodeMap map = new Id2NodeMap(); int numNodes = -1; From 98dc8642b8aa89c324a4653e7d98b73598b8a263 Mon Sep 17 00:00:00 2001 From: Sebastiano Vigna Date: Wed, 22 Feb 2023 08:29:16 +0000 Subject: [PATCH 26/85] Better handling of the zero key --- .../dsi/webgraph/ScatteredArcsASCIIGraph.java | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java b/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java index 2847c97..55b2427 100644 --- a/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java +++ b/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java @@ -151,9 +151,12 @@ public static class Id2NodeMap implements Hash { /** The big array of values. */ protected int[][] value; - /** Whether the zero key is present (the value is stored in position {@link #n). */ + /** Whether the zero key is present (the value is stored in {@link #zeroValue). 
*/ protected boolean containsZeroKey; + /** The value associated with the zero key, if {@link #containsZeroKey}. */ + protected int zeroValue; + /** The acceptable load factor. */ protected final float f; @@ -178,7 +181,10 @@ public static class Id2NodeMap implements Hash { private void initMasks() { mask = n - 1; baseMask = key.length - 1; - segmentMask = baseMask == 0 ? (int)(n - 1) : BigArrays.SEGMENT_SIZE - 1; + /* Note that either we have more than one segment, and in this case all segments + * are BigArrays.SEGMENT_SIZE long, or we have exactly one segment whose length + * is a power of two. */ + segmentMask = key[0].length - 1; } /** @@ -197,8 +203,8 @@ public Id2NodeMap(final long expected, final float f) { this.f = f; n = bigArraySize(expected, f); maxFill = maxFill(n, f); - key = LongBigArrays.newBigArray(n + 1); - value = IntBigArrays.newBigArray(n + 1); + key = LongBigArrays.newBigArray(n); + value = IntBigArrays.newBigArray(n); initMasks(); } @@ -219,8 +225,8 @@ public Id2NodeMap() { */ public int getNode(final long id) { if (id == 0) { - if (containsZeroKey) return BigArrays.get(value, n); - BigArrays.set(value, n, size); + if (containsZeroKey) return zeroValue; + zeroValue = size; containsZeroKey = true; } else { @@ -247,11 +253,11 @@ public int getNode(final long id) { protected void rehash(final long newN) { final long key[][] = this.key; final int[][] value = this.value; - final long newKey[][] = LongBigArrays.newBigArray(newN + 1); - final int newValue[][] = IntBigArrays.newBigArray(newN + 1); + final long newKey[][] = LongBigArrays.newBigArray(newN); + final int newValue[][] = IntBigArrays.newBigArray(newN); final long newMask = newN - 1; final int newBaseMask = newKey.length - 1; - final int newSegmentMask = newBaseMask == 0 ? (int)(newN - 1) : BigArrays.SEGMENT_SIZE - 1; + final int newSegmentMask = newKey[0].length - 1; final int realSize = containsZeroKey ? 
size - 1 : size; int base = 0, displ = 0; @@ -276,8 +282,6 @@ protected void rehash(final long newN) { base = (base + ((displ = (displ + 1) & segmentMask) == 0 ? 1 : 0)); } - BigArrays.set(newValue, newN, BigArrays.get(value, n)); - this.n = newN; this.key = newKey; this.value = newValue; @@ -308,7 +312,7 @@ public long[] getIds(final File tempDir) throws IOException { if (containsZeroKey) { key[b][d] = 0; - value[b][d] = BigArrays.get(value, n); + value[b][d] = zeroValue; } // The following weird code minimizes memory usage From 1dbbcc1f3b78ad8ddd9f5bcf78fd3954e7cc0b9b Mon Sep 17 00:00:00 2001 From: Sebastiano Vigna Date: Tue, 21 Feb 2023 21:45:56 +0000 Subject: [PATCH 27/85] Towards reducing class duplication --- src/it/unimi/dsi/webgraph/EFGraph.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/it/unimi/dsi/webgraph/EFGraph.java b/src/it/unimi/dsi/webgraph/EFGraph.java index ca85a89..1aa9801 100644 --- a/src/it/unimi/dsi/webgraph/EFGraph.java +++ b/src/it/unimi/dsi/webgraph/EFGraph.java @@ -637,14 +637,14 @@ public static EFGraph loadOffline(final CharSequence basename) throws IOExceptio return EFGraph.loadMapped(basename, null); } - /** An iterator returning the offsets. */ - private final static class OffsetsLongIterator implements LongIterator { + /** An iterator returning offsets by reading δ-encoded gaps. 
*/ + public final static class OffsetsLongIterator implements LongIterator { private final InputBitStream offsetIbs; private final long n; private long offset; private long i; - private OffsetsLongIterator(final InputBitStream offsetIbs, final long n) { + public OffsetsLongIterator(final InputBitStream offsetIbs, final long n) { this.offsetIbs = offsetIbs; this.n = n; } From fc5750e8a2b0256c0b4915f367c20de2f192bbd6 Mon Sep 17 00:00:00 2001 From: Sebastiano Vigna Date: Tue, 21 Feb 2023 21:46:25 +0000 Subject: [PATCH 28/85] Towards reducing class duplication; ported back main method --- .../BitStreamArcLabelledImmutableGraph.java | 158 +++++++++++++++--- 1 file changed, 134 insertions(+), 24 deletions(-) diff --git a/src/it/unimi/dsi/webgraph/labelling/BitStreamArcLabelledImmutableGraph.java b/src/it/unimi/dsi/webgraph/labelling/BitStreamArcLabelledImmutableGraph.java index 1dd548a..cbf6980 100644 --- a/src/it/unimi/dsi/webgraph/labelling/BitStreamArcLabelledImmutableGraph.java +++ b/src/it/unimi/dsi/webgraph/labelling/BitStreamArcLabelledImmutableGraph.java @@ -25,9 +25,23 @@ import java.io.PrintWriter; import java.nio.channels.FileChannel; import java.util.Properties; +import java.util.concurrent.TimeUnit; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.martiansoftware.jsap.FlaggedOption; +import com.martiansoftware.jsap.JSAP; +import com.martiansoftware.jsap.JSAPException; +import com.martiansoftware.jsap.JSAPResult; +import com.martiansoftware.jsap.Parameter; +import com.martiansoftware.jsap.SimpleJSAP; +import com.martiansoftware.jsap.Switch; +import com.martiansoftware.jsap.UnflaggedOption; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.io.FastMultiByteArrayInputStream; +import it.unimi.dsi.fastutil.longs.LongBigList; import it.unimi.dsi.fastutil.longs.LongIterator; import it.unimi.dsi.fastutil.objects.ObjectArrays; import it.unimi.dsi.io.ByteBufferInputStream; @@ -35,6 +49,7 @@ import 
it.unimi.dsi.io.OutputBitStream; import it.unimi.dsi.lang.ObjectParser; import it.unimi.dsi.logging.ProgressLogger; +import it.unimi.dsi.sux4j.util.EliasFanoMonotoneBigLongBigList; import it.unimi.dsi.sux4j.util.EliasFanoMonotoneLongBigList; import it.unimi.dsi.webgraph.AbstractLazyIntIterator; import it.unimi.dsi.webgraph.BVGraph; @@ -119,10 +134,13 @@ */ public class BitStreamArcLabelledImmutableGraph extends ArcLabelledImmutableGraph { + private static final Logger LOGGER = LoggerFactory.getLogger(BitStreamArcLabelledImmutableGraph.class); /** The standard extension for the labels bit stream. */ public static final String LABELS_EXTENSION = ".labels"; /** The standard extension for the label offsets bit stream. */ public static final String LABEL_OFFSETS_EXTENSION = ".labeloffsets"; + /** The standard extension for the cached {@link LongBigList} containing the label offsets. */ + public static final String LABEL_OFFSETS_BIG_LIST_EXTENSION = ".labelobl"; /** The standard property key for a label specification. */ public static final String LABELSPEC_PROPERTY_KEY = "labelspec"; @@ -145,7 +163,7 @@ public class BitStreamArcLabelledImmutableGraph extends ArcLabelledImmutableGrap /** The basename of this graph (required for offline access). */ protected final CharSequence basename; /** The offset array, or null for sequential access. */ - protected final EliasFanoMonotoneLongBigList offset; + protected final LongBigList offset; /** * Builds a new labelled graph using a bit stream of labels. @@ -161,7 +179,7 @@ public class BitStreamArcLabelledImmutableGraph extends ArcLabelledImmutableGrap * null, this memory-mapped stream is used as the bit stream of labels. * @param offset the offset array for random access, or null. 
*/ - protected BitStreamArcLabelledImmutableGraph(final CharSequence basename, final ImmutableGraph g, final Label prototype, final byte[] byteArray, final FastMultiByteArrayInputStream labelStream, final ByteBufferInputStream mappedLabelStream, final EliasFanoMonotoneLongBigList offset) { + protected BitStreamArcLabelledImmutableGraph(final CharSequence basename, final ImmutableGraph g, final Label prototype, final byte[] byteArray, final FastMultiByteArrayInputStream labelStream, final ByteBufferInputStream mappedLabelStream, final LongBigList offset) { this.g = g; this.byteArray = byteArray; this.labelStream = labelStream; @@ -311,6 +329,34 @@ public static BitStreamArcLabelledImmutableGraph load(final CharSequence basenam return load(LoadMethod.STANDARD, basename, pl); } + /** An iterator returning the label offsets by reading γ-encoded gaps. */ + public final static class LabelOffsetsLongIterator implements LongIterator { + private final InputBitStream offsetStream; + private final long n; + private long off; + private long i; + + public LabelOffsetsLongIterator(final long n, final InputBitStream offsetIbs) { + this.offsetStream = offsetIbs; + this.n = n; + } + + @Override + public boolean hasNext() { + return i <= n; + } + + @Override + public long nextLong() { + i++; + try { + return off = offsetStream.readLongGamma() + off; + } catch (final IOException e) { + throw new RuntimeException(e); + } + } + } + /** Loads a labelled graph using the given method. * * @param method a load method. 
@@ -364,7 +410,7 @@ protected static BitStreamArcLabelledImmutableGraph load(final LoadMethod method byte[] byteArray = null; FastMultiByteArrayInputStream labelStream = null; ByteBufferInputStream mappedLabelStream = null; - EliasFanoMonotoneLongBigList offsets = null; + LongBigList offsets = null; if (method != LoadMethod.OFFLINE) { if (pl != null) { @@ -392,33 +438,27 @@ protected static BitStreamArcLabelledImmutableGraph load(final LoadMethod method pl.expectedUpdates = g.numNodes() + 1; pl.start("Loading label offsets..."); } - final InputBitStream offsetStream = new InputBitStream(basename + LABEL_OFFSETS_EXTENSION); - - offsets = new EliasFanoMonotoneLongBigList(g.numNodes() + 1, size * Byte.SIZE + 1, new LongIterator() { - private long off; - private int i; - - @Override - public boolean hasNext() { - return i <= g.numNodes(); + final File offsetsBigListFile = new File(basename + LABEL_OFFSETS_BIG_LIST_EXTENSION); + if (offsetsBigListFile.exists()) { + try { + offsets = (LongBigList)BinIO.loadObject(offsetsBigListFile); } - @Override - public long nextLong() { - i++; - try { - return off = offsetStream.readLongGamma() + off; - } - catch (final IOException e) { - throw new RuntimeException(e); + catch (final ClassNotFoundException e) { + if (pl != null) { + LOGGER.warn("A cached long big list of offsets was found, but its class is unknown", e); } } - }); - - offsetStream.close(); + } + if (offsets == null) { + final InputBitStream offsetStream = new InputBitStream(basename + LABEL_OFFSETS_EXTENSION); + offsets = (EliasFanoMonotoneLongBigList.fits(g.numNodes() + 1, size * Byte.SIZE + 1)) ? 
new EliasFanoMonotoneLongBigList(g.numNodes() + 1, size * Byte.SIZE + 1, new LabelOffsetsLongIterator(g.numNodes(), offsetStream)) : new EliasFanoMonotoneBigLongBigList(g.numNodes() + 1, size * Byte.SIZE + 1, new LabelOffsetsLongIterator(g.numNodes(), offsetStream)); + offsetStream.close(); + } if (pl != null) { pl.count = g.numNodes() + 1; pl.done(); - pl.logger().info("Label pointer bits per node: " + offsets.numBits() / (g.numNodes() + 1.0)); + final long offsetsNumBits = (offsets instanceof EliasFanoMonotoneLongBigList) ? ((EliasFanoMonotoneLongBigList)offsets).numBits() : ((EliasFanoMonotoneBigLongBigList)offsets).numBits(); + pl.logger().info("Label pointer bits per node: " + offsetsNumBits / (g.numNodes() + 1.0)); } } @@ -637,4 +677,74 @@ public static void saveProperties(final Label prototype, final CharSequence base properties.println(BitStreamArcLabelledImmutableGraph.LABELSPEC_PROPERTY_KEY + " = " + prototype.toSpec()); properties.close(); } + + /** An iterator returning γ-encoded offsets. */ + public final static class OffsetsLongIterator implements LongIterator { + private final InputBitStream offsetStream; + private final long n; + private long off; + private long i; + + public OffsetsLongIterator(final long n, final InputBitStream offsetIbs) { + this.offsetStream = offsetIbs; + this.n = n; + } + + @Override + public boolean hasNext() { + return i <= n; + } + + @Override + public long nextLong() { + i++; + try { + return off = offsetStream.readLongGamma() + off; + } catch (final IOException e) { + throw new RuntimeException(e); + } + } + } + + /** + * Reads an arc-labelled immutable graph and stores it as a + * {@link BitStreamArcLabelledImmutableGraph}. + */ + public static void main(final String[] args) throws JSAPException, IOException { + final SimpleJSAP jsap = new SimpleJSAP(BVGraph.class.getName(), "Write an ArcLabelledGraph as a BitStreamArcLabelledImmutableGraph. 
Source and destination are basenames from which suitable filenames will be stemmed.", new Parameter[] { + new Switch("list", 'L', "list", "Precomputes an Elias-Fano list of offsets for the source labels."), + new FlaggedOption("underlyingBasename", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 'u', "underlying", "The basename of the underlying graph"), + new UnflaggedOption("sourceBasename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The basename of the source graph, or a source spec if --spec was given; it is immaterial when --once is specified."), + new UnflaggedOption("destBasename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, JSAP.NOT_GREEDY, "The basename of the destination graph; if omitted, no recompression is performed. This is useful in conjunction with --offsets and --list."), }); + + final JSAPResult jsapResult = jsap.parse(args); + if (jsap.messagePrinted()) System.exit(1); + + final boolean list = jsapResult.getBoolean("list"); + final String source = jsapResult.getString("sourceBasename"); + final String dest = jsapResult.getString("destBasename"); + final String underlying = jsapResult.getString("underlyingBasename"); + + final ProgressLogger pl = new ProgressLogger(LOGGER, 10, TimeUnit.SECONDS); + final ArcLabelledImmutableGraph graph = ArcLabelledImmutableGraph.loadOffline(source, pl); + + if (dest != null) { + if (list) throw new IllegalArgumentException("You cannot specify a destination graph with these options"); + if (underlying == null) throw new IllegalArgumentException("You must specify an underlying graph with --underlying if you want to store a BitStreamArcLabelledImmutableGraph"); + BitStreamArcLabelledImmutableGraph.store(graph, dest, underlying, pl); + } else { + if (list) { + final FileInputStream fis = new FileInputStream(source + LABELS_EXTENSION); + final long size = fis.getChannel().size(); + final ImmutableGraph g = ImmutableGraph.loadOffline(source, pl); + final InputBitStream offsetStream = 
new InputBitStream(source + LABEL_OFFSETS_EXTENSION); + final LongBigList offsets = (EliasFanoMonotoneLongBigList.fits(g.numNodes() + 1, size * Byte.SIZE + 1)) ? new EliasFanoMonotoneLongBigList(g.numNodes() + 1, size * Byte.SIZE + 1, new OffsetsLongIterator(g.numNodes(), offsetStream)) : new EliasFanoMonotoneBigLongBigList(g.numNodes() + 1, size * Byte.SIZE + 1, new OffsetsLongIterator(g.numNodes(), offsetStream)); + offsetStream.close(); + fis.close(); + BinIO.storeObject(offsets, g.basename() + LABEL_OFFSETS_BIG_LIST_EXTENSION); + } else { + throw new IllegalArgumentException("You must specify a destination graph."); + } + } + } } From 9ffc0b94edae386dcc0cc182ef04685f855aed13 Mon Sep 17 00:00:00 2001 From: Sebastiano Vigna Date: Tue, 21 Feb 2023 21:46:58 +0000 Subject: [PATCH 29/85] Abstracted id code --- .../dsi/webgraph/ScatteredArcsASCIIGraph.java | 101 +++++++----------- 1 file changed, 41 insertions(+), 60 deletions(-) diff --git a/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java b/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java index 1208360..4fce3a7 100644 --- a/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java +++ b/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java @@ -191,12 +191,13 @@ private void initMasks() { } /** - * Creates a new hash big set. + * Creates a new hash big map. * - *

The actual table size will be the least power of two greater than + *

+ * The actual table size will be the least power of two greater than * expected/f. * - * @param expected the expected number of elements in the set. + * @param expected the expected number of elements in the map. * @param f the load factor. */ public Long2IntOpenHashBigMap(final long expected, final float f) { @@ -306,7 +307,17 @@ protected void rehash(final long newN) { maxFill = maxFill(n, f); } - public void compact() { + /** + * Assuming that the map is a minimal perfect hash, returns the list of keys in value order. + * + *

+ * The map is not usable after this call. + * + * @param tempDir a temporary directory for storing keys and values. + * @return the list of keys in value order. + */ + public long[] getIds(final File tempDir) throws IOException { + // Here we assume that the map is a minimal perfect hash int base = 0, displ = 0, b = 0, d = 0; for(long i = size; i-- != 0;) { while (! used[base][displ]) base = (base + ((displ = (displ + 1) & segmentMask) == 0 ? 1 : 0)) & baseMask; @@ -315,6 +326,28 @@ public void compact() { base = (base + ((displ = (displ + 1) & segmentMask) == 0 ? 1 : 0)) & baseMask; b = (b + ((d = (d + 1) & segmentMask) == 0 ? 1 : 0)) & baseMask; } + + // The following weird code minimizes memory usage + final File keyFile = File.createTempFile(ScatteredArcsASCIIGraph.class.getSimpleName(), "keys", tempDir); + keyFile.deleteOnExit(); + final File valueFile = File.createTempFile(ScatteredArcsASCIIGraph.class.getSimpleName(), "values", tempDir); + valueFile.deleteOnExit(); + + BinIO.storeLongs(key, 0, size(), keyFile); + BinIO.storeInts(value, 0, size(), valueFile); + + used = null; + key = null; + value = null; + + final long[][] key = BinIO.loadLongsBig(keyFile); + keyFile.delete(); + final int[][] value = BinIO.loadIntsBig(valueFile); + valueFile.delete(); + + final long[] result = new long[(int)size]; + for (int i = (int)size(); i-- != 0;) result[BigArrays.get(value, i)] = BigArrays.get(key, i); + return result; } public long size() { @@ -465,7 +498,7 @@ public ScatteredArcsASCIIGraph(final InputStream is, final Object2LongFunction function, Charset charset, final int n, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { @SuppressWarnings("resource") final FastBufferedInputStream fbis = new FastBufferedInputStream(is); - Long2IntOpenHashBigMap map = new Long2IntOpenHashBigMap(); + final Long2IntOpenHashBigMap map = new Long2IntOpenHashBigMap(); int numNodes = -1; if (charset == 
null) charset = Charset.forName("ISO-8859-1"); @@ -624,34 +657,7 @@ public ScatteredArcsASCIIGraph(final InputStream is, final Object2LongFunctionnull. */ public ScatteredArcsASCIIGraph(final Iterator arcs, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { - Long2IntOpenHashBigMap map = new Long2IntOpenHashBigMap(); + final Long2IntOpenHashBigMap map = new Long2IntOpenHashBigMap(); int numNodes = -1; @@ -742,32 +748,7 @@ public ScatteredArcsASCIIGraph(final Iterator arcs, final boolean symmet source = null; target = null; - map.compact(); - - final File keyFile = File.createTempFile(ScatteredArcsASCIIGraph.class.getSimpleName(), "keys", tempDir); - keyFile.deleteOnExit(); - final File valueFile = File.createTempFile(ScatteredArcsASCIIGraph.class.getSimpleName(), "values", tempDir); - valueFile.deleteOnExit(); - - BinIO.storeLongs(map.key, 0, map.size(), keyFile); - BinIO.storeInts(map.value, 0, map.size(), valueFile); - - map = null; - - long[][] key = BinIO.loadLongsBig(keyFile); - keyFile.delete(); - int[][] value = BinIO.loadIntsBig(valueFile); - valueFile.delete(); - - ids = new long[numNodes]; - - final long[] result = new long[numNodes]; - for(int i = numNodes; i--!= 0;) result[BigArrays.get(value, i)] = BigArrays.get(key, i); - ids = result; - - key = null; - value = null; - + ids = map.getIds(tempDir); batchGraph = new Transform.BatchGraph(numNodes, pairs, batches); } From 97c6ce4ce57d12421237da07277fb19279ac9a90 Mon Sep 17 00:00:00 2001 From: Sebastiano Vigna Date: Wed, 22 Feb 2023 00:10:52 +0000 Subject: [PATCH 30/85] Reimplemented the id big map in the new fastutil style --- .../dsi/webgraph/ScatteredArcsASCIIGraph.java | 161 ++++++++---------- src/it/unimi/dsi/webgraph/Transform.java | 2 +- .../webgraph/ScatteredArcsASCIIGraphTest.java | 17 ++ 3 files changed, 88 insertions(+), 92 deletions(-) diff --git a/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java 
b/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java index 4fce3a7..961eb90 100644 --- a/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java +++ b/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java @@ -42,7 +42,6 @@ import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.Hash; -import it.unimi.dsi.fastutil.booleans.BooleanBigArrays; import it.unimi.dsi.fastutil.bytes.ByteArrays; import it.unimi.dsi.fastutil.ints.IntBigArrays; import it.unimi.dsi.fastutil.io.BinIO; @@ -145,53 +144,45 @@ public class ScatteredArcsASCIIGraph extends ImmutableSequentialGraph { /** The list of identifiers in order of appearance. */ public long[] ids; - private static final class Long2IntOpenHashBigMap implements java.io.Serializable, Cloneable, Hash { - public static final long serialVersionUID = 0L; - + public static class ID2NodeMap implements Hash { /** The big array of keys. */ - public transient long[][] key; + protected long[][] key; /** The big array of values. */ - public transient int[][] value; + protected int[][] value; - /** The big array telling whether a position is used. */ - protected transient boolean[][] used; + /** Whether the zero key is present (the value is stored in position {@link #n). */ + protected boolean containsZeroKey; /** The acceptable load factor. */ protected final float f; /** The current table size (always a power of 2). */ - protected transient long n; + protected long n; /** Threshold after which we rehash. It must be the table size times {@link #f}. */ - protected transient long maxFill; + protected long maxFill; /** The mask for wrapping a position counter. */ - protected transient long mask; + protected long mask; /** The mask for wrapping a segment counter. */ - protected transient int segmentMask; + protected int segmentMask; /** The mask for wrapping a base counter. */ - protected transient int baseMask; + protected int baseMask; /** Number of entries in the set. 
*/ - protected long size; + protected int size; - /** Initialises the mask values. */ private void initMasks() { mask = n - 1; - /* - * Note that either we have more than one segment, and in this case all segments are - * BigArrays.SEGMENT_SIZE long, or we have exactly one segment whose length is a power of - * two. - */ - segmentMask = key[0].length - 1; baseMask = key.length - 1; + segmentMask = baseMask == 0 ? (int)(n - 1) : BigArrays.SEGMENT_SIZE - 1; } /** - * Creates a new hash big map. + * Creates a new map based on a hash table. * *

* The actual table size will be the least power of two greater than @@ -200,15 +191,14 @@ private void initMasks() { * @param expected the expected number of elements in the map. * @param f the load factor. */ - public Long2IntOpenHashBigMap(final long expected, final float f) { + public ID2NodeMap(final long expected, final float f) { if (f <= 0 || f > 1) throw new IllegalArgumentException("Load factor must be greater than 0 and smaller than or equal to 1"); if (n < 0) throw new IllegalArgumentException("The expected number of elements must be nonnegative"); this.f = f; n = bigArraySize(expected, f); maxFill = maxFill(n, f); - key = LongBigArrays.newBigArray(n); - value = IntBigArrays.newBigArray(n); - used = BooleanBigArrays.newBigArray(n); + key = LongBigArrays.newBigArray(n + 1); + value = IntBigArrays.newBigArray(n + 1); initMasks(); } @@ -217,116 +207,110 @@ public Long2IntOpenHashBigMap(final long expected, final float f) { * and {@link Hash#DEFAULT_LOAD_FACTOR} as load factor. */ - public Long2IntOpenHashBigMap() { + public ID2NodeMap() { this(DEFAULT_INITIAL_SIZE, DEFAULT_LOAD_FACTOR); } - public int put(final long k, final int v) { - final long h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); - - // The starting point. - int displ = (int)(h & segmentMask); - int base = (int)((h & mask) >>> BigArrays.SEGMENT_SHIFT); - - // There's always an unused entry. - while (used[base][displ]) { - if (k == key[base][displ]) { - final int oldValue = value[base][displ]; - value[base][displ] = v; - return oldValue; - } - base = (base + ((displ = (displ + 1) & segmentMask) == 0 ? 1 : 0)) & baseMask; - } - - used[base][displ] = true; - key[base][displ] = k; - value[base][displ] = v; + /** + * Returns the node associated with a given identifier, assigning a new one if necessary. + * + * @param id an identifier. + * @return the associated node. 
+ */ + public int getNode(final long id) { + if (id == 0) { + if (containsZeroKey) return BigArrays.get(value, n); + BigArrays.set(value, n, size); + containsZeroKey = true; + } else { - if (++size >= maxFill) rehash(2 * n); - return -1; - } + final long h = it.unimi.dsi.fastutil.HashCommon.mix(id); - public int get(final long k) { - final long h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); + // The starting point. + int displ = (int)(h & segmentMask); + int base = (int)((h & mask) >>> BigArrays.SEGMENT_SHIFT); - // The starting point. - int displ = (int)(h & segmentMask); - int base = (int)((h & mask) >>> BigArrays.SEGMENT_SHIFT); + // There's always an unused entry. + while (key[base][displ] != 0) { + if (id == key[base][displ]) return value[base][displ]; + base = (base + ((displ = (displ + 1) & segmentMask) == 0 ? 1 : 0)) & baseMask; + } - // There's always an unused entry. - while (used[base][displ]) { - if (k == key[base][displ]) return value[base][displ]; - base = (base + ((displ = (displ + 1) & segmentMask) == 0 ? 1 : 0)) & baseMask; + key[base][displ] = id; + value[base][displ] = size; } - return -1; + if (++size >= maxFill) rehash(2 * n); + return size - 1; } protected void rehash(final long newN) { - final boolean used[][] = this.used; final long key[][] = this.key; final int[][] value = this.value; - final boolean newUsed[][] = BooleanBigArrays.newBigArray(newN); - final long newKey[][] = LongBigArrays.newBigArray(newN); - final int newValue[][] = IntBigArrays.newBigArray(newN); + final long newKey[][] = LongBigArrays.newBigArray(newN + 1); + final int newValue[][] = IntBigArrays.newBigArray(newN + 1); final long newMask = newN - 1; - final int newSegmentMask = newKey[0].length - 1; final int newBaseMask = newKey.length - 1; + final int newSegmentMask = newBaseMask == 0 ? (int)(newN - 1) : BigArrays.SEGMENT_SIZE - 1; + final int realSize = containsZeroKey ? 
size - 1 : size; int base = 0, displ = 0; - long h; - long k; - - for (long i = size; i-- != 0;) { - while (!used[base][displ]) + for (int i = realSize; i-- != 0;) { + while (key[base][displ] == 0) base = (base + ((displ = (displ + 1) & segmentMask) == 0 ? 1 : 0)); - k = key[base][displ]; - h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); + final long k = key[base][displ]; + final long h = it.unimi.dsi.fastutil.HashCommon.mix(k); // The starting point. int d = (int)(h & newSegmentMask); int b = (int)((h & newMask) >>> BigArrays.SEGMENT_SHIFT); - while (newUsed[b][d]) + while (newKey[b][d] != 0) b = (b + ((d = (d + 1) & newSegmentMask) == 0 ? 1 : 0)) & newBaseMask; - newUsed[b][d] = true; newKey[b][d] = k; newValue[b][d] = value[base][displ]; base = (base + ((displ = (displ + 1) & segmentMask) == 0 ? 1 : 0)); } + BigArrays.set(newValue, newN, BigArrays.get(value, n)); + this.n = newN; this.key = newKey; this.value = newValue; - this.used = newUsed; initMasks(); maxFill = maxFill(n, f); } /** - * Assuming that the map is a minimal perfect hash, returns the list of keys in value order. + * Returns the id list in order of appearance as an array. * *

* The map is not usable after this call. * * @param tempDir a temporary directory for storing keys and values. - * @return the list of keys in value order. + * @return the id list in order of appearance. */ public long[] getIds(final File tempDir) throws IOException { // Here we assume that the map is a minimal perfect hash + final int realSize = containsZeroKey ? size - 1 : size; int base = 0, displ = 0, b = 0, d = 0; - for(long i = size; i-- != 0;) { - while (! used[base][displ]) base = (base + ((displ = (displ + 1) & segmentMask) == 0 ? 1 : 0)) & baseMask; + for (int i = realSize; i-- != 0;) { + while (key[base][displ] == 0) base = (base + ((displ = (displ + 1) & segmentMask) == 0 ? 1 : 0)) & baseMask; key[b][d] = key[base][displ]; value[b][d] = value[base][displ]; base = (base + ((displ = (displ + 1) & segmentMask) == 0 ? 1 : 0)) & baseMask; b = (b + ((d = (d + 1) & segmentMask) == 0 ? 1 : 0)) & baseMask; } + if (containsZeroKey) { + key[b][d] = 0; + value[b][d] = BigArrays.get(value, n); + } + // The following weird code minimizes memory usage final File keyFile = File.createTempFile(ScatteredArcsASCIIGraph.class.getSimpleName(), "keys", tempDir); keyFile.deleteOnExit(); @@ -336,7 +320,6 @@ public long[] getIds(final File tempDir) throws IOException { BinIO.storeLongs(key, 0, size(), keyFile); BinIO.storeInts(value, 0, size(), valueFile); - used = null; key = null; value = null; @@ -345,7 +328,7 @@ public long[] getIds(final File tempDir) throws IOException { final int[][] value = BinIO.loadIntsBig(valueFile); valueFile.delete(); - final long[] result = new long[(int)size]; + final long[] result = new long[size]; for (int i = (int)size(); i-- != 0;) result[BigArrays.get(value, i)] = BigArrays.get(key, i); return result; } @@ -498,7 +481,7 @@ public ScatteredArcsASCIIGraph(final InputStream is, final Object2LongFunction function, Charset charset, final int n, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final 
ProgressLogger pl) throws IOException { @SuppressWarnings("resource") final FastBufferedInputStream fbis = new FastBufferedInputStream(is); - final Long2IntOpenHashBigMap map = new Long2IntOpenHashBigMap(); + final ID2NodeMap map = new ID2NodeMap(); int numNodes = -1; if (charset == null) charset = Charset.forName("ISO-8859-1"); @@ -556,8 +539,7 @@ public ScatteredArcsASCIIGraph(final InputStream is, final Object2LongFunction " + s); } @@ -599,8 +581,7 @@ public ScatteredArcsASCIIGraph(final InputStream is, final Object2LongFunction " + t); } @@ -691,7 +672,7 @@ private final static long getLong(final byte[] array, int offset, int length) { * @param pl a progress logger, or null. */ public ScatteredArcsASCIIGraph(final Iterator arcs, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { - final Long2IntOpenHashBigMap map = new Long2IntOpenHashBigMap(); + final ID2NodeMap map = new ID2NodeMap(); int numNodes = -1; @@ -709,11 +690,9 @@ public ScatteredArcsASCIIGraph(final Iterator arcs, final boolean symmet while(arcs.hasNext()) { final long[] arc = arcs.next(); final long sl = arc[0]; - int s = map.get(sl); - if (s == -1) map.put(sl, s = (int)map.size()); + final int s = map.getNode(sl); final long tl = arc[1]; - int t = map.get(tl); - if (t == -1) map.put(tl, t = (int)map.size()); + final int t = map.getNode(tl); if (s != t || ! noLoops) { source[j] = s; diff --git a/src/it/unimi/dsi/webgraph/Transform.java b/src/it/unimi/dsi/webgraph/Transform.java index ed308a9..c83be08 100644 --- a/src/it/unimi/dsi/webgraph/Transform.java +++ b/src/it/unimi/dsi/webgraph/Transform.java @@ -656,7 +656,7 @@ public static ImmutableGraph map(final ImmutableGraph g, final int map[], final if (! 
g.randomAccess()) throw new IllegalArgumentException("Graph mapping requires random access"); final int sourceNumNodes = g.numNodes(); - if (map.length != sourceNumNodes) throw new IllegalArgumentException("The graph to be mapped has " + sourceNumNodes + " whereas the map contains " + map.length + " entries"); + if (map.length != sourceNumNodes) throw new IllegalArgumentException("The graph to be mapped has " + sourceNumNodes + " nodes whereas the map contains " + map.length + " entries"); int max = -1; if (pl != null) { diff --git a/test/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraphTest.java b/test/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraphTest.java index 82c39be..61a368a 100644 --- a/test/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraphTest.java +++ b/test/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraphTest.java @@ -32,6 +32,7 @@ import it.unimi.dsi.fastutil.io.FastByteArrayInputStream; import it.unimi.dsi.fastutil.objects.Object2LongArrayMap; import it.unimi.dsi.fastutil.objects.Object2LongFunction; +import it.unimi.dsi.webgraph.examples.ErdosRenyiGraph; public class ScatteredArcsASCIIGraphTest extends WebGraphTestCase { @@ -207,4 +208,20 @@ public void testConstructorWithArray() throws IOException { } + @Test + public void testLarge() throws IOException { + final ImmutableGraph erdosRenyiGraph = new ErdosRenyiGraph(100000, .0005, 0, true); + final StringBuilder b = new StringBuilder(); + for (final NodeIterator nodeIterator = erdosRenyiGraph.nodeIterator(); nodeIterator.hasNext();) { + final int curr = nodeIterator.nextInt(); + final LazyIntIterator successors = nodeIterator.successors(); + for (int s; (s = successors.nextInt()) != -1;) b.append(-curr).append('\t').append(-s).append('\n'); + } + + final ScatteredArcsASCIIGraph g = new ScatteredArcsASCIIGraph(new FastByteArrayInputStream(b.toString().getBytes("ASCII")), false, false, 10000, null, null); + final int[] perm = new int[g.numNodes()]; + for (int i = 0; i < g.numNodes(); i++) perm[i] = (int)-g.ids[i]; + 
+ assertEquals(erdosRenyiGraph, Transform.map(new ArrayListMutableGraph(g).immutableView(), perm)); + } } From 4befba4338e26b6518fc69939e34fac2bb8efba5 Mon Sep 17 00:00:00 2001 From: Sebastiano Vigna Date: Wed, 22 Feb 2023 00:29:19 +0000 Subject: [PATCH 31/85] Better name --- src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java b/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java index 961eb90..2847c97 100644 --- a/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java +++ b/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java @@ -144,7 +144,7 @@ public class ScatteredArcsASCIIGraph extends ImmutableSequentialGraph { /** The list of identifiers in order of appearance. */ public long[] ids; - public static class ID2NodeMap implements Hash { + public static class Id2NodeMap implements Hash { /** The big array of keys. */ protected long[][] key; @@ -191,7 +191,7 @@ private void initMasks() { * @param expected the expected number of elements in the map. * @param f the load factor. */ - public ID2NodeMap(final long expected, final float f) { + public Id2NodeMap(final long expected, final float f) { if (f <= 0 || f > 1) throw new IllegalArgumentException("Load factor must be greater than 0 and smaller than or equal to 1"); if (n < 0) throw new IllegalArgumentException("The expected number of elements must be nonnegative"); this.f = f; @@ -207,7 +207,7 @@ public ID2NodeMap(final long expected, final float f) { * and {@link Hash#DEFAULT_LOAD_FACTOR} as load factor. 
*/ - public ID2NodeMap() { + public Id2NodeMap() { this(DEFAULT_INITIAL_SIZE, DEFAULT_LOAD_FACTOR); } @@ -481,7 +481,7 @@ public ScatteredArcsASCIIGraph(final InputStream is, final Object2LongFunction function, Charset charset, final int n, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { @SuppressWarnings("resource") final FastBufferedInputStream fbis = new FastBufferedInputStream(is); - final ID2NodeMap map = new ID2NodeMap(); + final Id2NodeMap map = new Id2NodeMap(); int numNodes = -1; if (charset == null) charset = Charset.forName("ISO-8859-1"); @@ -672,7 +672,7 @@ private final static long getLong(final byte[] array, int offset, int length) { * @param pl a progress logger, or null. */ public ScatteredArcsASCIIGraph(final Iterator arcs, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { - final ID2NodeMap map = new ID2NodeMap(); + final Id2NodeMap map = new Id2NodeMap(); int numNodes = -1; From 6ae41baa5b33ae82108ff9c9f758ab76049d1356 Mon Sep 17 00:00:00 2001 From: Sebastiano Vigna Date: Wed, 22 Feb 2023 08:29:16 +0000 Subject: [PATCH 32/85] Better handling of the zero key --- .../dsi/webgraph/ScatteredArcsASCIIGraph.java | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java b/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java index 2847c97..55b2427 100644 --- a/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java +++ b/src/it/unimi/dsi/webgraph/ScatteredArcsASCIIGraph.java @@ -151,9 +151,12 @@ public static class Id2NodeMap implements Hash { /** The big array of values. */ protected int[][] value; - /** Whether the zero key is present (the value is stored in position {@link #n). */ + /** Whether the zero key is present (the value is stored in {@link #zeroValue). 
*/ protected boolean containsZeroKey; + /** The value associated with the zero key, if {@link #containsZeroKey}. */ + protected int zeroValue; + /** The acceptable load factor. */ protected final float f; @@ -178,7 +181,10 @@ public static class Id2NodeMap implements Hash { private void initMasks() { mask = n - 1; baseMask = key.length - 1; - segmentMask = baseMask == 0 ? (int)(n - 1) : BigArrays.SEGMENT_SIZE - 1; + /* Note that either we have more than one segment, and in this case all segments + * are BigArrays.SEGMENT_SIZE long, or we have exactly one segment whose length + * is a power of two. */ + segmentMask = key[0].length - 1; } /** @@ -197,8 +203,8 @@ public Id2NodeMap(final long expected, final float f) { this.f = f; n = bigArraySize(expected, f); maxFill = maxFill(n, f); - key = LongBigArrays.newBigArray(n + 1); - value = IntBigArrays.newBigArray(n + 1); + key = LongBigArrays.newBigArray(n); + value = IntBigArrays.newBigArray(n); initMasks(); } @@ -219,8 +225,8 @@ public Id2NodeMap() { */ public int getNode(final long id) { if (id == 0) { - if (containsZeroKey) return BigArrays.get(value, n); - BigArrays.set(value, n, size); + if (containsZeroKey) return zeroValue; + zeroValue = size; containsZeroKey = true; } else { @@ -247,11 +253,11 @@ public int getNode(final long id) { protected void rehash(final long newN) { final long key[][] = this.key; final int[][] value = this.value; - final long newKey[][] = LongBigArrays.newBigArray(newN + 1); - final int newValue[][] = IntBigArrays.newBigArray(newN + 1); + final long newKey[][] = LongBigArrays.newBigArray(newN); + final int newValue[][] = IntBigArrays.newBigArray(newN); final long newMask = newN - 1; final int newBaseMask = newKey.length - 1; - final int newSegmentMask = newBaseMask == 0 ? (int)(newN - 1) : BigArrays.SEGMENT_SIZE - 1; + final int newSegmentMask = newKey[0].length - 1; final int realSize = containsZeroKey ? 
size - 1 : size; int base = 0, displ = 0; @@ -276,8 +282,6 @@ protected void rehash(final long newN) { base = (base + ((displ = (displ + 1) & segmentMask) == 0 ? 1 : 0)); } - BigArrays.set(newValue, newN, BigArrays.get(value, n)); - this.n = newN; this.key = newKey; this.value = newValue; @@ -308,7 +312,7 @@ public long[] getIds(final File tempDir) throws IOException { if (containsZeroKey) { key[b][d] = 0; - value[b][d] = BigArrays.get(value, n); + value[b][d] = zeroValue; } // The following weird code minimizes memory usage From 83b19513a50e8728b6ac94ec1ac1bc3abe8029c8 Mon Sep 17 00:00:00 2001 From: Sebastiano Vigna Date: Wed, 22 Feb 2023 13:27:06 +0100 Subject: [PATCH 33/85] Added script to process BlockChair files --- bash/blockchair.sh | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100755 bash/blockchair.sh diff --git a/bash/blockchair.sh b/bash/blockchair.sh new file mode 100755 index 0000000..0be7720 --- /dev/null +++ b/bash/blockchair.sh @@ -0,0 +1,40 @@ +#!/bin/bash -e + +if [[ "$2" == "" ]]; then + echo "$(basename $0) DIR NTHREADS [OUTPUT]" 1>&2 + echo "Reads files in DIR and processes them using NTHREADS parallel sorts." 1>&2 + echo "Files are processed as input files unless OUTPUT is specified." 
1>&2 + exit 1 +fi + +DIR=$1 +NTHREADS=$2 +OUTPUT=$3 + +FILES=$(mktemp) +find $DIR -type f >$FILES + +NFILES=$(cat $FILES | wc -l) + +if (( NFILES < 2 * NTHREADS )); then + echo "$NTHREADS threads > $NFILES files" 1>&2 + exit 1 +fi + +SPLIT=$(mktemp) +split -n l/$NTHREADS $FILES $SPLIT +SPLITS=$(for file in ${SPLIT}?*; do echo $file; done) + +for file in $SPLITS; do + mkfifo $file.pipe + if [[ "$OUTPUT" != "" ]] ; + (cut -f2,7,10 $(cat $file) | awk '{ if ($3 != 0) print $2 "\t" $1 }' | sort -k2 -S2G >$file.pipe) & + else + (cut -f7,13 $(cat $file) | sort -k2 -S2G >$file.pipe) & + fi +done + +sort -S2G -m $(for file in $SPLITS; do echo $file.pipe; done) + +rm $FILES +rm ${SPLIT}* From 090a952b2f6207bfea8a32e514854ad20518d473 Mon Sep 17 00:00:00 2001 From: Sebastiano Vigna Date: Wed, 22 Feb 2023 13:30:02 +0100 Subject: [PATCH 34/85] Fixed merge --- bash/blockchair.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bash/blockchair.sh b/bash/blockchair.sh index 0be7720..60141f6 100755 --- a/bash/blockchair.sh +++ b/bash/blockchair.sh @@ -34,7 +34,7 @@ for file in $SPLITS; do fi done -sort -S2G -m $(for file in $SPLITS; do echo $file.pipe; done) +sort -k2 -S2G -m $(for file in $SPLITS; do echo $file.pipe; done) rm $FILES rm ${SPLIT}* From 32f759f4a8bcc3da13fa505fceb0be76445873d8 Mon Sep 17 00:00:00 2001 From: Sebastiano Vigna Date: Wed, 22 Feb 2023 14:07:56 +0100 Subject: [PATCH 35/85] Added fix in docs --- bash/blockchair.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bash/blockchair.sh b/bash/blockchair.sh index 60141f6..785edf8 100755 --- a/bash/blockchair.sh +++ b/bash/blockchair.sh @@ -4,6 +4,7 @@ if [[ "$2" == "" ]]; then echo "$(basename $0) DIR NTHREADS [OUTPUT]" 1>&2 echo "Reads files in DIR and processes them using NTHREADS parallel sorts." 1>&2 echo "Files are processed as input files unless OUTPUT is specified." 1>&2 + echo "FILES MUST END WITH A NEWLINE. Fix them with \"sed -i -e '\$a\\' *\"." 
1>&2 exit 1 fi @@ -27,10 +28,10 @@ SPLITS=$(for file in ${SPLIT}?*; do echo $file; done) for file in $SPLITS; do mkfifo $file.pipe - if [[ "$OUTPUT" != "" ]] ; - (cut -f2,7,10 $(cat $file) | awk '{ if ($3 != 0) print $2 "\t" $1 }' | sort -k2 -S2G >$file.pipe) & + if [[ "$OUTPUT" != "" ]]; then + (tail -q -n+2 $(cat $file) | cut -f2,7,10 | awk '{ if ($3 != 0) print $2 "\t" $1 }' | sort -k2 -S2G >$file.pipe) & else - (cut -f7,13 $(cat $file) | sort -k2 -S2G >$file.pipe) & + (tail -q -n+2 $(cat $file) | cut -f7,13 | sort -k2 -S2G >$file.pipe) & fi done From 241e7037b3e72707461e540250ee547fce52c22f Mon Sep 17 00:00:00 2001 From: Sebastiano Vigna Date: Wed, 22 Feb 2023 14:28:53 +0100 Subject: [PATCH 36/85] Fixed coinbase test --- bash/blockchair.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bash/blockchair.sh b/bash/blockchair.sh index 785edf8..c941acd 100755 --- a/bash/blockchair.sh +++ b/bash/blockchair.sh @@ -29,7 +29,7 @@ SPLITS=$(for file in ${SPLIT}?*; do echo $file; done) for file in $SPLITS; do mkfifo $file.pipe if [[ "$OUTPUT" != "" ]]; then - (tail -q -n+2 $(cat $file) | cut -f2,7,10 | awk '{ if ($3 != 0) print $2 "\t" $1 }' | sort -k2 -S2G >$file.pipe) & + (tail -q -n+2 $(cat $file) | cut -f2,7,10 | awk '{ if ($3 == 0) print $2 "\t" $1 }' | sort -k2 -S2G >$file.pipe) & else (tail -q -n+2 $(cat $file) | cut -f7,13 | sort -k2 -S2G >$file.pipe) & fi From 8da9b9a378aad99d3218520c71afeb175cfaa8e8 Mon Sep 17 00:00:00 2001 From: Luigi Foscari Date: Wed, 22 Feb 2023 15:03:43 +0100 Subject: [PATCH 37/85] switched to Id2NodeMap inside constructor --- .../ScatteredLabelledArcsASCIIGraph.java | 82 +++---------------- 1 file changed, 10 insertions(+), 72 deletions(-) diff --git a/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java b/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java index 863fd9a..d2d84b6 100644 --- a/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java +++ 
b/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java @@ -36,6 +36,7 @@ import it.unimi.dsi.lang.MutableString; import it.unimi.dsi.logging.ProgressLogger; import it.unimi.dsi.webgraph.ImmutableSequentialGraph; +import it.unimi.dsi.webgraph.ScatteredArcsASCIIGraph; import it.unimi.dsi.webgraph.Transform; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -299,7 +300,7 @@ public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFu public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, Charset charset, final int n, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { @SuppressWarnings("resource") final FastBufferedInputStream fbis = new FastBufferedInputStream(is); - ScatteredLabelledArcsASCIIGraph.Long2IntOpenHashBigMap map = new ScatteredLabelledArcsASCIIGraph.Long2IntOpenHashBigMap(); + ScatteredArcsASCIIGraph.Id2NodeMap map = new ScatteredArcsASCIIGraph.Id2NodeMap(); int numNodes = -1; if (charset == null) charset = StandardCharsets.ISO_8859_1; @@ -361,8 +362,7 @@ public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFu continue; } - s = map.get(sl); - if (s == -1) map.put(sl, s = (int)map.size()); + s = map.getNode(sl); if (DEBUG) System.err.println("Parsed source at line " + line + ": " + sl + " => " + s); } else { @@ -402,8 +402,7 @@ public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFu continue; } - t = map.get(tl); - if (t == -1) map.put(tl, t = (int)map.size()); + t = map.getNode(tl); if (DEBUG) System.err.println("Parsed target at line " + line + ": " + tl + " => " + t); } else { @@ -495,34 +494,10 @@ public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFu source = null; target = null; - map.compact(); - 
- final File keyFile = File.createTempFile(ScatteredLabelledArcsASCIIGraph.class.getSimpleName(), "keys", tempDir); - keyFile.deleteOnExit(); - final File valueFile = File.createTempFile(ScatteredLabelledArcsASCIIGraph.class.getSimpleName(), "values", tempDir); - valueFile.deleteOnExit(); - - BinIO.storeLongs(map.key, 0, map.size(), keyFile); - BinIO.storeInts(map.value, 0, map.size(), valueFile); - - map = null; - - long[][] key = BinIO.loadLongsBig(keyFile); - keyFile.delete(); - int[][] value = BinIO.loadIntsBig(valueFile); - valueFile.delete(); - if (function == null) { - ids = new long[numNodes]; - - final long[] result = new long[numNodes]; - for (int i = numNodes; i-- != 0; ) result[BigArrays.get(value, i)] = BigArrays.get(key, i); - ids = result; + ids = map.getIds(tempDir); } - key = null; - value = null; - this.arcLabelledBatchGraph = new Transform.ArcLabelledBatchGraph(function == null ? numNodes : n, pairs, batches, labelBatches, prototype, labelMergeStrategy); } @@ -540,7 +515,8 @@ public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFu * @param pl a progress logger, or null. */ public ScatteredLabelledArcsASCIIGraph(final Iterator arcs, final Long2IntFunction function, final Iterator

The actual table size will be the least power of two greater than - * expected/f. - * - * @param expected the expected number of elements in the set. - * @param f the load factor. - */ - public Long2IntOpenHashBigMap(final long expected, final float f) { - if (f <= 0 || f > 1) - throw new IllegalArgumentException("Load factor must be greater than 0 and smaller than or equal to 1"); - if (this.n < 0) throw new IllegalArgumentException("The expected number of elements must be nonnegative"); - this.f = f; - this.n = bigArraySize(expected, f); - this.maxFill = maxFill(this.n, f); - this.key = LongBigArrays.newBigArray(this.n); - this.value = IntBigArrays.newBigArray(this.n); - this.used = BooleanBigArrays.newBigArray(this.n); - this.initMasks(); - } - - /** - * Creates a new hash big set with initial expected {@link Hash#DEFAULT_INITIAL_SIZE} elements and - * {@link Hash#DEFAULT_LOAD_FACTOR} as load factor. - */ - - public Long2IntOpenHashBigMap() { - this(DEFAULT_INITIAL_SIZE, DEFAULT_LOAD_FACTOR); - } - - /** - * Initialises the mask values. - */ - private void initMasks() { - this.mask = this.n - 1; - /* - * Note that either we have more than one segment, and in this case all segments are - * BigArrays.SEGMENT_SIZE long, or we have exactly one segment whose length is a power of - * two. - */ - this.segmentMask = this.key[0].length - 1; - this.baseMask = this.key.length - 1; - } - - public int put(final long k, final int v) { - final long h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); - - // The starting point. - int displ = (int)(h & this.segmentMask); - int base = (int)((h & this.mask) >>> BigArrays.SEGMENT_SHIFT); - - // There's always an unused entry. - while (this.used[base][displ]) { - if (k == this.key[base][displ]) { - final int oldValue = this.value[base][displ]; - this.value[base][displ] = v; - return oldValue; - } - base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 
1 : 0)) & this.baseMask; - } - - this.used[base][displ] = true; - this.key[base][displ] = k; - this.value[base][displ] = v; - - if (++this.size >= this.maxFill) this.rehash(2 * this.n); - return -1; - } - - public int get(final long k) { - final long h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); - - // The starting point. - int displ = (int)(h & this.segmentMask); - int base = (int)((h & this.mask) >>> BigArrays.SEGMENT_SHIFT); - - // There's always an unused entry. - while (this.used[base][displ]) { - if (k == this.key[base][displ]) return this.value[base][displ]; - base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; - } - - return -1; - } - - private void rehash(final long newN) { - final boolean[][] used = this.used; - final long[][] key = this.key; - final int[][] value = this.value; - final boolean[][] newUsed = BooleanBigArrays.newBigArray(newN); - final long[][] newKey = LongBigArrays.newBigArray(newN); - final int[][] newValue = IntBigArrays.newBigArray(newN); - final long newMask = newN - 1; - final int newSegmentMask = newKey[0].length - 1; - final int newBaseMask = newKey.length - 1; - - int base = 0, displ = 0; - long h; - long k; - - for (long i = this.size; i-- != 0; ) { - - while (!used[base][displ]) base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)); - - k = key[base][displ]; - h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); - - // The starting point. - int d = (int)(h & newSegmentMask); - int b = (int)((h & newMask) >>> BigArrays.SEGMENT_SHIFT); - - while (newUsed[b][d]) b = (b + ((d = (d + 1) & newSegmentMask) == 0 ? 1 : 0)) & newBaseMask; - - newUsed[b][d] = true; - newKey[b][d] = k; - newValue[b][d] = value[base][displ]; - - base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 
1 : 0)); - } - - this.n = newN; - this.key = newKey; - this.value = newValue; - this.used = newUsed; - this.initMasks(); - this.maxFill = maxFill(this.n, this.f); - } - - public void compact() { - int base = 0, displ = 0, b = 0, d = 0; - for (long i = this.size; i-- != 0; ) { - while (!this.used[base][displ]) - base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; - this.key[b][d] = this.key[base][displ]; - this.value[b][d] = this.value[base][displ]; - base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; - b = (b + ((d = (d + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; - } - } - - public long size() { - return this.size; - } - } - - /* @SuppressWarnings("unchecked") + @SuppressWarnings("unchecked") public static void main(final String[] args) throws IllegalArgumentException, SecurityException, IOException, JSAPException, ClassNotFoundException { - String basename; - final SimpleJSAP jsap = new SimpleJSAP(ScatteredLabelledArcsASCIIGraph.class.getName(), "Converts a scattered list of arcs from standard input into a BVGraph. The list of" + - "identifiers in order of appearance will be saved with extension \"" + IDS_EXTENSION + "\", unless a translation function has been specified.", + final SimpleJSAP jsap = new SimpleJSAP(ScatteredLabelledArcsASCIIGraph.class.getName(), + "Converts a scattered list of labelled arcs from standard input into a BVGraph. The list of " + + "identifiers in order of appearance will be saved with extension \"" + IDS_EXTENSION + "\", " + + "unless a translation function has been specified. The labels must be written after each " + + "arc, will be interpreted as integers and stored in gamma coding unless a mapping function " + + "has been specified alongside a label prototype. 
The underlying representation of the labels " + + "will be saved as the given basename with the \"" + UNDERLYINGGRAPH_SUFFIX + "\" suffix.", new Parameter[]{ new FlaggedOption("logInterval", JSAP.LONG_PARSER, Long.toString(ProgressLogger.DEFAULT_LOG_INTERVAL), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds."), new FlaggedOption("batchSize", JSAP.INTSIZE_PARSER, Integer.toString(DEFAULT_BATCH_SIZE), JSAP.NOT_REQUIRED, 's', "batch-size", "The maximum size of a batch, in arcs."), @@ -911,20 +725,26 @@ public static void main(final String[] args) throws IllegalArgumentException, Se new FlaggedOption("maxRefCount", JSAP.INTEGER_PARSER, String.valueOf(BVGraph.DEFAULT_MAX_REF_COUNT), JSAP.NOT_REQUIRED, 'm', "max-ref-count", "Maximum number of backward references (-1 for ∞)."), new FlaggedOption("minIntervalLength", JSAP.INTEGER_PARSER, String.valueOf(BVGraph.DEFAULT_MIN_INTERVAL_LENGTH), JSAP.NOT_REQUIRED, 'i', "min-interval-length", "Minimum length of an interval (0 to disable)."), new FlaggedOption("zetaK", JSAP.INTEGER_PARSER, String.valueOf(BVGraph.DEFAULT_ZETA_K), JSAP.NOT_REQUIRED, 'k', "zeta-k", "The k parameter for zeta-k codes."), + new FlaggedOption("labelPrototype", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'p', "label-prototype", "The prototype of the labels"), + new FlaggedOption("labelMapping", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'a', "label-mapping", "A serialised function from strings to the given label prototype that will be used to translate label strings to label object."), + new FlaggedOption("labelMergeStrategy", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'e', "label-merge-strategy", "A serialized LabelMergeStrategy object defining how to tread duplicater arcs with the same label."), new UnflaggedOption("basename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The basename of the output graph"), } ); final JSAPResult 
jsapResult = jsap.parse(args); - if (jsap.messagePrinted()) System.exit(1); + if (jsap.messagePrinted()) { + System.exit(1); + } - basename = jsapResult.getString("basename"); + String basename = jsapResult.getString("basename"); int flags = 0; - for (final String compressionFlag : jsapResult.getStringArray("comp")) + for(final String compressionFlag: jsapResult.getStringArray("comp")) try { flags |= BVGraph.class.getField(compressionFlag).getInt(BVGraph.class); - } catch (final Exception notFound) { + } + catch (final Exception notFound) { throw new JSAPException("Compression method " + compressionFlag + " unknown."); } @@ -937,27 +757,58 @@ public static void main(final String[] args) throws IllegalArgumentException, Se Object2LongFunction function = null; Charset charset = null; int n = -1; + if (jsapResult.userSpecified("function")) { function = (Object2LongFunction) BinIO.loadObject(jsapResult.getString("function")); charset = Charset.forName(jsapResult.getString("charset")); if (function.size() == -1) { - if (!jsapResult.userSpecified("n")) + if (!jsapResult.userSpecified("n")) { throw new IllegalArgumentException("You must specify a graph size if you specify a translation function that does not return the size of the key set."); + } n = jsapResult.getInt("n"); - } else n = function.size(); + } else { + n = function.size(); + } + } + + if (jsapResult.userSpecified("labelPrototype") != jsapResult.userSpecified("labelMapping")) { + throw new IllegalArgumentException("You must specify either both a label prototype and a label mapping or none."); + } + + Label labelPrototype = new GammaCodedIntLabel("FOO"); + if (jsapResult.userSpecified("labelPrototype")) { + labelPrototype = (Label) BinIO.loadObject(jsapResult.getString("labelPrototype")); + } + + LabelMapping labelMapping = (label, st) -> ((GammaCodedIntLabel)label).value = Integer.parseInt((String) st); + if (jsapResult.userSpecified("labelMapping")) { + labelMapping = (LabelMapping) 
BinIO.loadObject(jsapResult.getString("labelMapping")); + } + + LabelMergeStrategy labelMergeStrategy = null; + if (jsapResult.userSpecified("labelMergeStrategy")) { + labelMergeStrategy = (LabelMergeStrategy) BinIO.loadObject(jsapResult.getString("labelMergeStrategy")); } File tempDir = null; - if (jsapResult.userSpecified("tempDir")) tempDir = new File(jsapResult.getString("tempDir")); + if (jsapResult.userSpecified("tempDir")) { + tempDir = new File(jsapResult.getString("tempDir")); + } final ProgressLogger pl = new ProgressLogger(LOGGER, jsapResult.getLong("logInterval"), TimeUnit.MILLISECONDS); final boolean zipped = jsapResult.getBoolean("zipped"); final InputStream inStream = (zipped ? new GZIPInputStream(System.in) : System.in); - final ScatteredLabelledArcsASCIIGraph graph = new ScatteredLabelledArcsASCIIGraph(inStream, function, - // TODO: insert default labelMapping e labelFunction - charset, n, jsapResult.userSpecified("symmetrize"), jsapResult.userSpecified("noLoops"), jsapResult.getInt("batchSize"), tempDir, pl); - BVGraph.store(graph, basename, windowSize, maxRefCount, minIntervalLength, zetaK, flags, pl); - if (function == null) BinIO.storeLongs(graph.ids, basename + IDS_EXTENSION); - } */ + + final ScatteredLabelledArcsASCIIGraph graph = new ScatteredLabelledArcsASCIIGraph( + inStream, function, labelPrototype, labelMapping, labelMergeStrategy, + charset, n, jsapResult.userSpecified("symmetrize"), jsapResult.userSpecified("noLoops"), + jsapResult.getInt("batchSize"), tempDir, pl); + BVGraph.storeLabelled(graph.arcLabelledBatchGraph, basename, basename + UNDERLYINGGRAPH_SUFFIX, + windowSize, maxRefCount, minIntervalLength, zetaK, flags, pl); + + if (function == null) { + BinIO.storeLongs(graph.ids, basename + IDS_EXTENSION); + } + } } diff --git a/test/it/unimi/dsi/webgraph/ScatteredLabelledArcsASCIIGraphTest.java b/test/it/unimi/dsi/webgraph/ScatteredLabelledArcsASCIIGraphTest.java index 5e05afc..c85ddd0 100644 --- 
a/test/it/unimi/dsi/webgraph/ScatteredLabelledArcsASCIIGraphTest.java +++ b/test/it/unimi/dsi/webgraph/ScatteredLabelledArcsASCIIGraphTest.java @@ -39,7 +39,7 @@ public class ScatteredLabelledArcsASCIIGraphTest extends WebGraphTestCase { private static final Label gammaPrototype = new GammaCodedIntLabel("FOO"); private static final Long2IntFunction identity = Math::toIntExact; private static final LabelMapping hashcodeMapping = (label, st) -> ((GammaCodedIntLabel)label).value = st.hashCode(); - private static final LabelMapping integerMapping = (label, st) -> ((GammaCodedIntLabel)label).value = Integer.parseInt(st); + private static final LabelMapping integerMapping = (label, st) -> ((GammaCodedIntLabel)label).value = Integer.parseInt((String) st); private static Iterator toArcsIterator(final String s) { final String[] arcs = s.split("\n"); From ec7139e029ca13cb2239a1f755953e195d9942ad Mon Sep 17 00:00:00 2001 From: Luigi Foscari Date: Wed, 22 Feb 2023 17:15:24 +0100 Subject: [PATCH 40/85] reordered parameters in constructor for consistency, added more constructors, added class and variables explanation --- .../ScatteredLabelledArcsASCIIGraph.java | 343 ++++++++++++------ .../ScatteredLabelledArcsASCIIGraphTest.java | 47 ++- 2 files changed, 257 insertions(+), 133 deletions(-) diff --git a/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java b/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java index a14cca5..49369a8 100644 --- a/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java +++ b/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java @@ -19,16 +19,11 @@ import com.martiansoftware.jsap.*; import it.unimi.dsi.Util; -import it.unimi.dsi.fastutil.BigArrays; -import it.unimi.dsi.fastutil.Hash; -import it.unimi.dsi.fastutil.booleans.BooleanBigArrays; import it.unimi.dsi.fastutil.bytes.ByteArrays; -import it.unimi.dsi.fastutil.ints.IntBigArrays; import it.unimi.dsi.fastutil.io.BinIO; 
import it.unimi.dsi.fastutil.io.FastBufferedInputStream; import it.unimi.dsi.fastutil.io.FastByteArrayOutputStream; import it.unimi.dsi.fastutil.longs.Long2IntFunction; -import it.unimi.dsi.fastutil.longs.LongBigArrays; import it.unimi.dsi.fastutil.objects.Object2IntFunction; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.fastutil.objects.ObjectArrayList; @@ -36,10 +31,8 @@ import it.unimi.dsi.io.OutputBitStream; import it.unimi.dsi.lang.MutableString; import it.unimi.dsi.logging.ProgressLogger; -import it.unimi.dsi.webgraph.BVGraph; -import it.unimi.dsi.webgraph.ImmutableSequentialGraph; -import it.unimi.dsi.webgraph.ScatteredArcsASCIIGraph; -import it.unimi.dsi.webgraph.Transform; +import it.unimi.dsi.sux4j.mph.GOV3Function; +import it.unimi.dsi.webgraph.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -52,13 +45,95 @@ import java.util.concurrent.TimeUnit; import java.util.zip.GZIPInputStream; -import static it.unimi.dsi.fastutil.HashCommon.bigArraySize; -import static it.unimi.dsi.fastutil.HashCommon.maxFill; import static it.unimi.dsi.webgraph.Transform.processTransposeBatch; import static it.unimi.dsi.webgraph.labelling.ArcLabelledImmutableGraph.UNDERLYINGGRAPH_SUFFIX; /** - * TODO: write description (adapt the one from ScatteredArcsASCIIGraph) + * An {@link ArcLabelledImmutableGraph} that corresponds to a labelled graph stored as a scattered list of arcs. + * + *

+ * A scattered list of arcs describes a graph in a fairly loose way. Each line contains a + * labelled arc specified as two node identifiers and a label separated by whitespace (but we suggest exactly one TAB + * character). + * + *

+ * In the standard description, node identifiers can be in the range + * [-2<sup>63</sup>..2<sup>63</sup>): they will be remapped in a compact identifier space by + * assigning to each newly seen identifier a new node number. The list of identifiers in order of + * appearance is available in {@link #ids}. Lines can be empty, or comments starting with + * #. Characters following the label will be discarded with a warning. + * Similarly, the labels can be in the range [-2<sup>63</sup>..2<sup>63</sup>) and will be saved + * as-is in gamma coding; in case of duplicate arcs only the last label will be considered, but + * this behaviour can be changed by providing more parameters. + * + *

+ * Warning: Lines not conforming to the above specification will cause an error to be + * logged, but will be otherwise ignored. + * + *

+ * Alternatively, you can + * {@linkplain #ScatteredLabelledArcsASCIIGraph(InputStream, Object2LongFunction, Charset, int, boolean) + * provide} an {@link Object2LongFunction Object2LongFunction<String>} with default return value + * -1 that will be used to map identifiers to node numbers, along with a {@link Charset} to parse + * lines and the number of nodes of the graph (which must be a strict upper bound for the largest + * value returned by the function). Note that in principle an {@link Object2IntFunction} would be + * sufficient, but we want to make it easier to use functions from Sux4J such as {@link GOV3Function}. + * + *

+ * Additionally, the resulting graph can be symmetrized, and its loops be removed, using + * {@linkplain #ScatteredLabelledArcsASCIIGraph(InputStream, boolean, boolean, int, File, ProgressLogger) + * suitable constructor options}. + * + *

+ * Using {@linkplain #ScatteredLabelledArcsASCIIGraph(InputStream, Label, LabelMapping, LabelMergeStrategy) + * suitable constructor options} you can provide a {@link Label} as prototype, a {@link LabelMapping} as a way to + * convert the written labels to objects of the prototype's type, and a {@link LabelMergeStrategy} + * to handle the case of identical arcs with different labels. + * + *

+ * This class has no load method, and its main method converts a scattered-arcs representation + * directly into a {@link BVGraph}. + * + *

Using {@link ScatteredLabelledArcsASCIIGraph} to convert your data

+ * + *

+ * A simple (albeit rather inefficient) way to import data into WebGraph is using ASCII graphs + * specified by scattered arcs. Suppose you create the following file, named + * example.arcs: + * + *

+ *  # My graph
+ *  -1 15 100
+ *  15 2 200
+ *  2 -1 300 This will cause a warning to be logged
+ *  OOPS! (This will cause an error to be logged)
+ *  -1 2 400
+ * 
+ * + * Then, the command + * + *
+ *  java it.unimi.dsi.webgraph.labelling.ScatteredLabelledArcsASCIIGraph example < example.arcs
+ * 
+ * + * will produce a compressed labelled graph in {@link it.unimi.dsi.webgraph.BVGraph} format. + * The underlying graph will be saved with basename example-underlying. + * The file example.ids will contain the list of longs -1, 15, 2. + * The node with identifier -1 will be node 0 in the output graph, the node with identifier + * 15 will be node 1, and the node with identifier 2 will be node 2. The graph example + * will thus have three nodes and four arcs (viz., <0,1>, <0,2>, <1,2> and + * <2,0>). The labels will be saved as example.labels in the order of visit + * of the arcs; the file example.labeloffsets records the offset of each label, + * because in general labels are not written in a fixed number of bits. + * + *

Memory requirements

+ * + *

+ * To convert node identifiers to node numbers, instances of this class use a custom map that in the + * worst case will require + * 19.5×2⌈log(4n/3)⌉ ≤ 52n bytes, + * where n is the number of distinct identifiers. Storing batches of arcs in memory + * requires 8 bytes per arc. */ public class ScatteredLabelledArcsASCIIGraph extends ImmutableSequentialGraph { @@ -66,8 +141,18 @@ public class ScatteredLabelledArcsASCIIGraph extends ImmutableSequentialGraph { * The default batch size. */ public static final int DEFAULT_BATCH_SIZE = 1000000; + /** + * The default label prototype. + */ + public static final Label DEFAULT_LABEL_PROTOTYPE = new GammaCodedIntLabel("FOO"); + /** + * The default label mapping function. + */ + public static final LabelMapping DEFAULT_LABEL_MAPPING = (label, st) -> ((GammaCodedIntLabel) label).value = Integer.parseInt((String) st); + private static final Logger LOGGER = LoggerFactory.getLogger(ScatteredLabelledArcsASCIIGraph.class); private final static boolean DEBUG = false; + /** * The extension of the identifier file (a binary list of longs). */ @@ -81,6 +166,15 @@ public class ScatteredLabelledArcsASCIIGraph extends ImmutableSequentialGraph { */ public long[] ids; + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a standard scattered list of arcs. + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is) throws IOException { + this(is, DEFAULT_LABEL_PROTOTYPE, DEFAULT_LABEL_MAPPING, null, false); + } + /** * Creates a scattered-arcs ASCII graph. * @@ -165,6 +259,22 @@ public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPr this(is, labelPrototype, labelMapping, labelMergeStrategy, symmetrize, noLoops, batchSize, tempDir, null); } + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a standard scattered list of arcs. + * @param symmetrize the new graph will be forced to be symmetric. 
+ * @param noLoops the new graph will have no loops. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by + * this method. + * @param tempDir a temporary directory for the batches, or null for + * {@link File#createTempFile(String, String)}'s choice. + * @param pl a progress logger, or null. + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { + this(is, null, null, -1, DEFAULT_LABEL_PROTOTYPE, DEFAULT_LABEL_MAPPING, null, symmetrize, noLoops, batchSize, tempDir, pl); + } + /** * Creates a scattered-arcs ASCII graph. * @@ -181,128 +291,143 @@ public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPr * @param pl a progress logger, or null. */ public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { - this(is, null, labelPrototype, labelMapping, labelMergeStrategy, null, -1, symmetrize, noLoops, batchSize, tempDir, pl); + this(is, null, null, -1, labelPrototype, labelMapping, labelMergeStrategy, symmetrize, noLoops, batchSize, tempDir, pl); } /** * Creates a scattered-arcs ASCII graph. * - * @param is an input stream containing a scattered list of arcs. - * @param function an explicitly provided function from string representing nodes to node numbers, or - * null for the standard behaviour. - * @param labelPrototype an example of the labels contained in the graph. - * @param labelMapping a function mapping string into the label defined by the prototype. + * @param is an input stream containing a scattered list of arcs. 
+ * @param function an explicitly provided function from string representing nodes to node numbers, or + * null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or + * null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param labelPrototype an example of the labels contained in the graph. + * @param labelMapping a function mapping string into the label defined by the prototype. * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels. - * @param charset a character set that will be used to read the identifiers passed to function, or - * null for ISO-8859-1 (used only if function is not null). - * @param n the number of nodes of the graph (used only if function is not null). */ - public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final Charset charset, final int n) throws IOException { - this(is, function, labelPrototype, labelMapping, labelMergeStrategy, charset, n, false); + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Charset charset, final int n, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy) throws IOException { + this(is, function, charset, n, labelPrototype, labelMapping, labelMergeStrategy, false); } /** * Creates a scattered-arcs ASCII graph. * - * @param is an input stream containing a scattered list of arcs. - * @param function an explicitly provided function from string representing nodes to node numbers, or - * null for the standard behaviour. - * @param labelPrototype an example of the labels contained in the graph. 
- * @param labelMapping a function mapping string into the label defined by the prototype. + * @param is an input stream containing a scattered list of arcs. + * @param function an explicitly provided function from string representing nodes to node numbers, or + * null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or + * null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param symmetrize the new graph will be forced to be symmetric. + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Charset charset, final int n, final boolean symmetrize) throws IOException { + this(is, function, charset, n, DEFAULT_LABEL_PROTOTYPE, DEFAULT_LABEL_MAPPING, null, symmetrize, false); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a scattered list of arcs. + * @param function an explicitly provided function from string representing nodes to node numbers, or + * null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or + * null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param labelPrototype an example of the labels contained in the graph. + * @param labelMapping a function mapping string into the label defined by the prototype. * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels. - * @param charset a character set that will be used to read the identifiers passed to function, or - * null for ISO-8859-1 (used only if function is not null). - * @param n the number of nodes of the graph (used only if function is not null). 
- * @param symmetrize the new graph will be forced to be symmetric. + * @param symmetrize the new graph will be forced to be symmetric. */ - public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final Charset charset, final int n, final boolean symmetrize) throws IOException { - this(is, function, labelPrototype, labelMapping, labelMergeStrategy, charset, n, symmetrize, false); + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Charset charset, final int n, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final boolean symmetrize) throws IOException { + this(is, function, charset, n, labelPrototype, labelMapping, labelMergeStrategy, symmetrize, false); } /** * Creates a scattered-arcs ASCII graph. * - * @param is an input stream containing a scattered list of arcs. - * @param function an explicitly provided function from string representing nodes to node numbers, or - * null for the standard behaviour. - * @param labelPrototype an example of the labels contained in the graph. - * @param labelMapping a function mapping string into the label defined by the prototype. + * @param is an input stream containing a scattered list of arcs. + * @param function an explicitly provided function from string representing nodes to node numbers, or + * null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or + * null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param labelPrototype an example of the labels contained in the graph. + * @param labelMapping a function mapping string into the label defined by the prototype. 
* @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels. - * @param charset a character set that will be used to read the identifiers passed to function, or - * null for ISO-8859-1 (used only if function is not null). - * @param n the number of nodes of the graph (used only if function is not null). - * @param symmetrize the new graph will be forced to be symmetric. - * @param noLoops the new graph will have no loops. + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. */ - public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final Charset charset, final int n, final boolean symmetrize, final boolean noLoops) throws IOException { - this(is, function, labelPrototype, labelMapping, labelMergeStrategy, charset, n, symmetrize, noLoops, DEFAULT_BATCH_SIZE); + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Charset charset, final int n, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final boolean symmetrize, final boolean noLoops) throws IOException { + this(is, function, charset, n, labelPrototype, labelMapping, labelMergeStrategy, symmetrize, noLoops, DEFAULT_BATCH_SIZE); } /** * Creates a scattered-arcs ASCII graph. * - * @param is an input stream containing a scattered list of arcs. - * @param function an explicitly provided function from string representing nodes to node numbers, or - * null for the standard behaviour. - * @param labelPrototype an example of the labels contained in the graph. - * @param labelMapping a function mapping string into the label defined by the prototype. + * @param is an input stream containing a scattered list of arcs. 
+ * @param function an explicitly provided function from string representing nodes to node numbers, or + * null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or + * null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param labelPrototype an example of the labels contained in the graph. + * @param labelMapping a function mapping string into the label defined by the prototype. * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels. - * @param charset a character set that will be used to read the identifiers passed to function, or - * null for ISO-8859-1 (used only if function is not null). - * @param n the number of nodes of the graph (used only if function is not null). - * @param symmetrize the new graph will be forced to be symmetric. - * @param noLoops the new graph will have no loops. - * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by - * this method. + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by + * this method. 
*/ - public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final Charset charset, final int n, final boolean symmetrize, final boolean noLoops, final int batchSize) throws IOException { - this(is, function, labelPrototype, labelMapping, labelMergeStrategy, charset, n, symmetrize, noLoops, batchSize, null); + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Charset charset, final int n, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final boolean symmetrize, final boolean noLoops, final int batchSize) throws IOException { + this(is, function, charset, n, labelPrototype, labelMapping, labelMergeStrategy, symmetrize, noLoops, batchSize, null); } /** * Creates a scattered-arcs ASCII graph. * - * @param is an input stream containing a scattered list of arcs. - * @param function an explicitly provided function from string representing nodes to node numbers, or - * null for the standard behaviour. - * @param labelPrototype an example of the labels contained in the graph. - * @param labelMapping a function mapping string into the label defined by the prototype. + * @param is an input stream containing a scattered list of arcs. + * @param function an explicitly provided function from string representing nodes to node numbers, or + * null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or + * null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param labelPrototype an example of the labels contained in the graph. + * @param labelMapping a function mapping string into the label defined by the prototype. 
* @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels. - * @param charset a character set that will be used to read the identifiers passed to function, or - * null for ISO-8859-1 (used only if function is not null). - * @param n the number of nodes of the graph (used only if function is not null). - * @param symmetrize the new graph will be forced to be symmetric. - * @param noLoops the new graph will have no loops. - * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by - * this method. - * @param tempDir a temporary directory for the batches, or null for - * {@link File#createTempFile(java.lang.String, java.lang.String)}'s choice. + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by + * this method. + * @param tempDir a temporary directory for the batches, or null for + * {@link File#createTempFile(String, String)}'s choice. 
*/ - public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final Charset charset, final int n, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir) throws IOException { - this(is, function, labelPrototype, labelMapping, labelMergeStrategy, charset, n, symmetrize, noLoops, batchSize, tempDir, null); + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Charset charset, final int n, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir) throws IOException { + this(is, function, charset, n, labelPrototype, labelMapping, labelMergeStrategy, symmetrize, noLoops, batchSize, tempDir, null); } /** * Creates a scattered-arcs ASCII graph. * - * @param is an input stream containing a scattered list of arcs. - * @param function an explicitly provided function from string representing nodes to node numbers, or - * null for the standard behaviour. - * @param labelPrototype an example of the labels contained in the graph. - * @param labelMapping a function mapping string into the label defined by the prototype. + * @param is an input stream containing a scattered list of arcs. + * @param function an explicitly provided function from string representing nodes to node numbers, or + * null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or + * null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param labelPrototype an example of the labels contained in the graph. 
+ * @param labelMapping a function mapping string into the label defined by the prototype. * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels. - * @param charset a character set that will be used to read the identifiers passed to function, or - * null for ISO-8859-1 (used only if function is not null). - * @param n the number of nodes of the graph (used only if function is not null). - * @param symmetrize the new graph will be forced to be symmetric. - * @param noLoops the new graph will have no loops. - * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by - * this method. - * @param tempDir a temporary directory for the batches, or null for - * {@link File#createTempFile(java.lang.String, java.lang.String)}'s choice. - * @param pl a progress logger, or null. + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by + * this method. + * @param tempDir a temporary directory for the batches, or null for + * {@link File#createTempFile(String, String)}'s choice. + * @param pl a progress logger, or null. 
*/ - public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, Charset charset, final int n, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, Charset charset, final int n, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { @SuppressWarnings("resource") final FastBufferedInputStream fbis = new FastBufferedInputStream(is); ScatteredArcsASCIIGraph.Id2NodeMap map = new ScatteredArcsASCIIGraph.Id2NodeMap(); @@ -509,20 +634,20 @@ public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFu /** * Creates a scattered-arcs ASCII graph. * - * @param arcs an iterator returning the arcs as two-element arrays. - * @param function a function to map the long ids passed in arcs to int nodes. - * @param arcLabels a homogeneous iterator returning the labels in the same order as the arcs. + * @param arcs an iterator returning the arcs as two-element arrays. + * @param function a function to map the long ids passed in arcs to int nodes. + * @param n the number of nodes of the graph (used only if function is not null). + * @param arcLabels a homogeneous iterator returning the labels in the same order as the arcs. * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels. - * @param n the number of nodes of the graph (used only if function is not null). - * @param symmetrize the new graph will be forced to be symmetric. - * @param noLoops the new graph will have no loops. 
- * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by - * this method. - * @param tempDir a temporary directory for the batches, or null for - * {@link File#createTempFile(java.lang.String, java.lang.String)}'s choice. - * @param pl a progress logger, or null. + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by + * this method. + * @param tempDir a temporary directory for the batches, or null for + * {@link File#createTempFile(String, String)}'s choice. + * @param pl a progress logger, or null. */ - public ScatteredLabelledArcsASCIIGraph(final Iterator arcs, final Long2IntFunction function, final Iterator

This method can process {@linkplain ImmutableGraph#loadOffline(CharSequence) offline graphs}. @@ -2618,8 +2625,8 @@ public static void main(final String args[]) throws IOException, IllegalArgument "transposeOffline sourceBasename destBasename [batchSize] [tempDir]\n" + "symmetrize sourceBasename [transposeBasename] destBasename\n" + "symmetrizeOffline sourceBasename destBasename [batchSize] [tempDir]\n" + - "simplifyOffline sourceBasename destBasename [batchSize] [tempDir]\n" + - "simplify sourceBasename transposeBasename destBasename\n" + + "simplifyOffline sourceBasename destBasename [batchSize] [tempDir]\n" + + "simplify sourceBasename transposeBasename destBasename\n" + "union source1Basename source2Basename destBasename [strategy]\n" + "compose source1Basename source2Basename destBasename [semiring]\n" + "gray sourceBasename destBasename\n" + @@ -2646,8 +2653,8 @@ public static void main(final String args[]) throws IOException, IllegalArgument new Switch("ascii", 'a', "ascii", "Maps are in ASCII form (one integer per line)."), new UnflaggedOption("transform", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The transformation to be applied."), new UnflaggedOption("param", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.GREEDY, "The remaining parameters."), - } - ); + } + ); final JSAPResult jsapResult = jsap.parse(args); if (jsap.messagePrinted()) System.exit(1); From 5e3b33d663978ac4f481bd22752c1952836a561c Mon Sep 17 00:00:00 2001 From: Luigi Foscari Date: Sat, 18 Feb 2023 16:03:40 +0100 Subject: [PATCH 55/85] created ScatteredLabelledArcsASCIIGraph by copying ScatteredArcsASCIIGraph and adapting the constructors (wip) --- .../ScatteredLabelledArcsASCIIGraph.java | 918 ++++++++++++++++++ 1 file changed, 918 insertions(+) create mode 100644 src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java diff --git a/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java 
b/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java new file mode 100644 index 0000000..a69eaf7 --- /dev/null +++ b/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java @@ -0,0 +1,918 @@ +/* + * Copyright (C) 2011-2023 Sebastiano Vigna + * + * This program and the accompanying materials are made available under the + * terms of the GNU Lesser General Public License v2.1 or later, + * which is available at + * http://www.gnu.org/licenses/old-licenses/lgpl-2.1-standalone.html, + * or the Apache Software License 2.0, which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. + * + * SPDX-License-Identifier: LGPL-2.1-or-later OR Apache-2.0 + */ + +package it.unimi.dsi.webgraph.labelling; + +import it.unimi.dsi.Util; +import it.unimi.dsi.fastutil.BigArrays; +import it.unimi.dsi.fastutil.Hash; +import it.unimi.dsi.fastutil.booleans.BooleanBigArrays; +import it.unimi.dsi.fastutil.bytes.ByteArrays; +import it.unimi.dsi.fastutil.ints.IntBigArrays; +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.fastutil.io.FastBufferedInputStream; +import it.unimi.dsi.fastutil.io.FastByteArrayOutputStream; +import it.unimi.dsi.fastutil.longs.LongBigArrays; +import it.unimi.dsi.fastutil.objects.Object2LongFunction; +import it.unimi.dsi.fastutil.objects.ObjectArrayList; +import it.unimi.dsi.io.InputBitStream; +import it.unimi.dsi.io.OutputBitStream; +import it.unimi.dsi.logging.ProgressLogger; +import it.unimi.dsi.webgraph.ImmutableSequentialGraph; +import it.unimi.dsi.webgraph.Transform; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.Iterator; + 
+import static it.unimi.dsi.fastutil.HashCommon.bigArraySize; +import static it.unimi.dsi.fastutil.HashCommon.maxFill; +import static it.unimi.dsi.webgraph.Transform.processTransposeBatch; + + +/** + * Da riscrivere + */ + +public class ScatteredLabelledArcsASCIIGraph extends ImmutableSequentialGraph { + private static final Logger LOGGER = LoggerFactory.getLogger(ScatteredLabelledArcsASCIIGraph.class); + + // TODO: rollback to false + private final static boolean DEBUG = true; + + /** + * The default batch size. + */ + public static final int DEFAULT_BATCH_SIZE = 1000000; + /** + * The extension of the identifier file (a binary list of longs). + */ + private static final String IDS_EXTENSION = ".ids"; + /** + * The batch graph used to return node iterators. + */ + private final Transform.ArcLabelledBatchGraph arcLabelledBatchGraph; + /** + * The list of identifiers in order of appearance. + */ + public long[] ids; + + private static final class Long2IntOpenHashBigMap implements java.io.Serializable, Cloneable, Hash { + public static final long serialVersionUID = 0L; + + /** + * The big array of keys. + */ + public transient long[][] key; + + /** + * The big array of values. + */ + public transient int[][] value; + + /** + * The big array telling whether a position is used. + */ + private transient boolean[][] used; + + /** + * The acceptable load factor. + */ + private final float f; + + /** + * The current table size (always a power of 2). + */ + private transient long n; + + /** + * Threshold after which we rehash. It must be the table size times {@link #f}. + */ + private transient long maxFill; + + /** + * The mask for wrapping a position counter. + */ + private transient long mask; + + /** + * The mask for wrapping a segment counter. + */ + private transient int segmentMask; + + /** + * The mask for wrapping a base counter. + */ + private transient int baseMask; + + /** + * Number of entries in the set. 
+ */ + private long size; + + /** + * Initialises the mask values. + */ + private void initMasks() { + this.mask = this.n - 1; + /* + * Note that either we have more than one segment, and in this case all segments are + * BigArrays.SEGMENT_SIZE long, or we have exactly one segment whose length is a power of + * two. + */ + this.segmentMask = this.key[0].length - 1; + this.baseMask = this.key.length - 1; + } + + /** + * Creates a new hash big set. + * + *

The actual table size will be the least power of two greater than + * expected/f. + * + * @param expected the expected number of elements in the set. + * @param f the load factor. + */ + public Long2IntOpenHashBigMap(final long expected, final float f) { + if (f <= 0 || f > 1) + throw new IllegalArgumentException("Load factor must be greater than 0 and smaller than or equal to 1"); + if (this.n < 0) throw new IllegalArgumentException("The expected number of elements must be nonnegative"); + this.f = f; + this.n = bigArraySize(expected, f); + this.maxFill = maxFill(this.n, f); + this.key = LongBigArrays.newBigArray(this.n); + this.value = IntBigArrays.newBigArray(this.n); + this.used = BooleanBigArrays.newBigArray(this.n); + this.initMasks(); + } + + /** + * Creates a new hash big set with initial expected {@link Hash#DEFAULT_INITIAL_SIZE} elements + * and {@link Hash#DEFAULT_LOAD_FACTOR} as load factor. + */ + + public Long2IntOpenHashBigMap() { + this(DEFAULT_INITIAL_SIZE, DEFAULT_LOAD_FACTOR); + } + + public int put(final long k, final int v) { + final long h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); + + // The starting point. + int displ = (int) (h & this.segmentMask); + int base = (int) ((h & this.mask) >>> BigArrays.SEGMENT_SHIFT); + + // There's always an unused entry. + while (this.used[base][displ]) { + if (k == this.key[base][displ]) { + final int oldValue = this.value[base][displ]; + this.value[base][displ] = v; + return oldValue; + } + base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; + } + + this.used[base][displ] = true; + this.key[base][displ] = k; + this.value[base][displ] = v; + + if (++this.size >= this.maxFill) this.rehash(2 * this.n); + return -1; + } + + public int get(final long k) { + final long h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); + + // The starting point. 
+ int displ = (int) (h & this.segmentMask); + int base = (int) ((h & this.mask) >>> BigArrays.SEGMENT_SHIFT); + + // There's always an unused entry. + while (this.used[base][displ]) { + if (k == this.key[base][displ]) return this.value[base][displ]; + base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; + } + + return -1; + } + + private void rehash(final long newN) { + final boolean[][] used = this.used; + final long[][] key = this.key; + final int[][] value = this.value; + final boolean[][] newUsed = BooleanBigArrays.newBigArray(newN); + final long[][] newKey = LongBigArrays.newBigArray(newN); + final int[][] newValue = IntBigArrays.newBigArray(newN); + final long newMask = newN - 1; + final int newSegmentMask = newKey[0].length - 1; + final int newBaseMask = newKey.length - 1; + + int base = 0, displ = 0; + long h; + long k; + + for (long i = this.size; i-- != 0; ) { + + while (!used[base][displ]) + base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)); + + k = key[base][displ]; + h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); + + // The starting point. + int d = (int) (h & newSegmentMask); + int b = (int) ((h & newMask) >>> BigArrays.SEGMENT_SHIFT); + + while (newUsed[b][d]) + b = (b + ((d = (d + 1) & newSegmentMask) == 0 ? 1 : 0)) & newBaseMask; + + newUsed[b][d] = true; + newKey[b][d] = k; + newValue[b][d] = value[base][displ]; + + base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)); + } + + this.n = newN; + this.key = newKey; + this.value = newValue; + this.used = newUsed; + this.initMasks(); + this.maxFill = maxFill(this.n, this.f); + } + + public void compact() { + int base = 0, displ = 0, b = 0, d = 0; + for (long i = this.size; i-- != 0; ) { + while (!this.used[base][displ]) + base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 
1 : 0)) & this.baseMask; + this.key[b][d] = this.key[base][displ]; + this.value[b][d] = this.value[base][displ]; + base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; + b = (b + ((d = (d + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; + } + } + + public long size() { + return this.size; + } + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a standard scattered list of arcs. + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPrototype, final LabelMapping labelMapping) throws IOException { + this(is, labelPrototype, labelMapping, false); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a standard scattered list of arcs. + * @param symmetrize the new graph will be forced to be symmetric. + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPrototype, final LabelMapping labelMapping, final boolean symmetrize) throws IOException { + this(is, labelPrototype, labelMapping, symmetrize, false); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a standard scattered list of arcs. + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPrototype, final LabelMapping labelMapping, final boolean symmetrize, final boolean noLoops) throws IOException { + this(is, labelPrototype, labelMapping, symmetrize, noLoops, DEFAULT_BATCH_SIZE); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a standard scattered list of arcs. + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. 
+ * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by this method. + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPrototype, final LabelMapping labelMapping, final boolean symmetrize, final boolean noLoops, final int batchSize) throws IOException { + this(is, labelPrototype, labelMapping, symmetrize, noLoops, batchSize, null); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a standard scattered list of arcs. + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by this method. + * @param tempDir a temporary directory for the batches, or null for {@link File#createTempFile(java.lang.String, java.lang.String)}'s choice. + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPrototype, final LabelMapping labelMapping, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir) throws IOException { + this(is, labelPrototype, labelMapping, symmetrize, noLoops, batchSize, tempDir, null); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a standard scattered list of arcs. + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by this method. + * @param tempDir a temporary directory for the batches, or null for {@link File#createTempFile(java.lang.String, java.lang.String)}'s choice. + * @param pl a progress logger, or null. 
+ */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPrototype, final LabelMapping labelMapping, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { + this(is, null, labelPrototype, labelMapping, null, -1, symmetrize, noLoops, batchSize, tempDir, pl); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a scattered list of arcs. + * @param function an explicitly provided function from string representing nodes to node numbers, or null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final Charset charset, final int n) throws IOException { + this(is, function, labelPrototype, labelMapping, charset, n, false); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a scattered list of arcs. + * @param function an explicitly provided function from string representing nodes to node numbers, or null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param symmetrize the new graph will be forced to be symmetric. 
+ */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final Charset charset, final int n, final boolean symmetrize) throws IOException { + this(is, function, labelPrototype, labelMapping, charset, n, symmetrize, false); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a scattered list of arcs. + * @param function an explicitly provided function from string representing nodes to node numbers, or null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final Charset charset, final int n, final boolean symmetrize, final boolean noLoops) throws IOException { + this(is, function, labelPrototype, labelMapping, charset, n, symmetrize, noLoops, DEFAULT_BATCH_SIZE); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a scattered list of arcs. + * @param function an explicitly provided function from string representing nodes to node numbers, or null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. 
+ * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by this method. + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final Charset charset, final int n, final boolean symmetrize, final boolean noLoops, final int batchSize) throws IOException { + this(is, function, labelPrototype, labelMapping, charset, n, symmetrize, noLoops, batchSize, null); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a scattered list of arcs. + * @param function an explicitly provided function from string representing nodes to node numbers, or null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by this method. + * @param tempDir a temporary directory for the batches, or null for {@link File#createTempFile(java.lang.String, java.lang.String)}'s choice. 
+ */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final Charset charset, final int n, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir) throws IOException { + this(is, function, labelPrototype, labelMapping, charset, n, symmetrize, noLoops, batchSize, tempDir, null); + } + + + // TODO: Move somewhere else + // Given a label prototype and a value set the value inside the label without creating a new one + // it's like a setter, but for labels. + public interface LabelMapping { + void apply(Label prototype, String representation); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a scattered list of arcs. + * @param function an explicitly provided function from string representing nodes to node numbers, or null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by this method. + * @param tempDir a temporary directory for the batches, or null for {@link File#createTempFile(java.lang.String, java.lang.String)}'s choice. + * @param pl a progress logger, or null. 
+ */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, Charset charset, final int n, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { + @SuppressWarnings("resource") final FastBufferedInputStream fbis = new FastBufferedInputStream(is); + ScatteredLabelledArcsASCIIGraph.Long2IntOpenHashBigMap map = new ScatteredLabelledArcsASCIIGraph.Long2IntOpenHashBigMap(); + + int numNodes = -1; + if (charset == null) charset = StandardCharsets.ISO_8859_1; + + int j; + int[] source = new int[batchSize], target = new int[batchSize]; + final long[] labelStart = new long[batchSize]; + FastByteArrayOutputStream fbos = new FastByteArrayOutputStream(); + OutputBitStream obs = new OutputBitStream(fbos); + final ObjectArrayList batches = new ObjectArrayList<>(), + labelBatches = new ObjectArrayList<>(); + final Label prototype = labelPrototype.copy(); + + if (pl != null) { + pl.itemsName = "labelled arcs"; + pl.start("Creating sorted batches..."); + } + + j = 0; + long pairs = 0; // Number of pairs + byte[] array = new byte[1024]; + for (long line = 1; ; line++) { + int start = 0, len; + while ((len = fbis.readLine(array, start, array.length - start, FastBufferedInputStream.ALL_TERMINATORS)) == array.length - start) { + start += len; + array = ByteArrays.grow(array, array.length + 1); + } + + if (len == -1) break; // EOF + + final int lineLength = start + len; + + if (DEBUG) + System.err.println("Reading line " + line + "... (" + new String(array, 0, lineLength, charset) + ")"); + + // Skip whitespace at the start of the line. 
+ int offset = 0; + while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ') offset++; + + if (offset == lineLength) { + if (DEBUG) System.err.println("Skipping line " + line + "..."); + continue; // Whitespace line + } + + if (array[0] == '#') continue; + + // Scan source id. + start = offset; + while (offset < lineLength && (array[offset] < 0 || array[offset] > ' ')) offset++; + + int s; + + if (function == null) { + final long sl; + try { + sl = getLong(array, start, offset - start); + } catch (final RuntimeException e) { + // Discard up to the end of line + LOGGER.error("Error at line " + line + ": " + e.getMessage()); + continue; + } + + s = map.get(sl); + if (s == -1) map.put(sl, s = (int) map.size()); + + if (DEBUG) System.err.println("Parsed source at line " + line + ": " + sl + " => " + s); + } else { + final String ss = new String(array, start, offset - start, charset); + final long sl = function.getLong(ss); + if (sl == -1) { + LOGGER.warn("Unknown source identifier " + ss + " at line " + line); + continue; + } + if (sl < 0 || sl >= n) + throw new IllegalArgumentException("Source node number out of range for node " + ss + ": " + sl); + s = (int) sl; + if (DEBUG) System.err.println("Parsed target at line " + line + ": " + ss + " => " + s); + } + + + // Skip whitespace between identifiers. + while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ') offset++; + + if (offset == lineLength) { + LOGGER.error("Error at line " + line + ": no target"); + continue; + } + + // Scan target id. 
+ start = offset; + while (offset < lineLength && (array[offset] < 0 || array[offset] > ' ')) offset++; + + int t; + + if (function == null) { + final long tl; + try { + tl = getLong(array, start, offset - start); + } catch (final RuntimeException e) { + // Discard up to the end of line + LOGGER.error("Error at line " + line + ": " + e.getMessage()); + continue; + } + + t = map.get(tl); + if (t == -1) map.put(tl, t = (int) map.size()); + + if (DEBUG) System.err.println("Parsed target at line " + line + ": " + tl + " => " + t); + } else { + final String ts = new String(array, start, offset - start, charset); + final long tl = function.getLong(ts); + if (tl == -1) { + LOGGER.warn("Unknown target identifier " + ts + " at line " + line); + continue; + } + + if (tl < 0 || tl >= n) + throw new IllegalArgumentException("Target node number out of range for node " + ts + ": " + tl); + t = (int) tl; + if (DEBUG) System.err.println("Parsed target at line " + line + ": " + ts + " => " + t); + } + + // Skip whitespace between identifiers. + while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ') offset++; + + if (offset == lineLength) { + LOGGER.error("Error at line " + line + ": no target"); + continue; + } + + // Scan label. + start = offset; + while (offset < lineLength && (array[offset] < 0 || array[offset] > ' ')) offset++; + + final String ls = new String(array, start, offset - start, charset); + + // Insert current value into the prototype label. + labelMapping.apply(prototype, ls); + if (DEBUG) System.err.println("Parsed label at line " + line + ": " + ls + " => " + prototype.get()); + + + // Skip whitespace after label. 
+ while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ') offset++; + + if (offset < lineLength) LOGGER.warn("Trailing characters ignored at line " + line); + + if (DEBUG) System.err.println("Parsed labelled arc at line " + line + ": " + s + " -> " + t + " (" + prototype.get() + ")"); + + if (s != t || !noLoops) { + source[j] = s; + target[j] = t; + labelStart[j] = obs.writtenBits(); + prototype.toBitStream(obs, s); + j++; + + if (symmetrize && s != t) { + source[j] = t; + target[j] = s; + labelStart[j] = obs.writtenBits(); + prototype.toBitStream(obs, t); + j++; + } + + if (j == batchSize) { + obs.flush(); + pairs += processTransposeBatch(batchSize, source, target, labelStart, new InputBitStream(fbos.array), tempDir, batches, labelBatches, prototype); + fbos = new FastByteArrayOutputStream(); + obs = new OutputBitStream(fbos); + j = 0; + } + + if (pl != null) pl.lightUpdate(); + } + } + + if (j != 0) { + obs.flush(); + pairs += processTransposeBatch(batchSize, source, target, labelStart, new InputBitStream(fbos.array), tempDir, batches, labelBatches, prototype); + } + + if (pl != null) { + pl.done(); + logBatches(batches, pairs, pl); + } + + numNodes = function == null ? (int) map.size() : function.size(); + source = null; + target = null; + + map.compact(); + + // Non capisco esattamente come mai salvare le chiavi e i valori della mappa per poi ricaricarli? + // Riguarda il memory management? Chiedere! + // Per ora lascio così com'è, ma se è da fare farei un terzo file e usere BinIO.storeObject per salvare le label. 
+ + final File keyFile = File.createTempFile(ScatteredLabelledArcsASCIIGraph.class.getSimpleName(), "keys", tempDir); + keyFile.deleteOnExit(); + final File valueFile = File.createTempFile(ScatteredLabelledArcsASCIIGraph.class.getSimpleName(), "values", tempDir); + valueFile.deleteOnExit(); + + BinIO.storeLongs(map.key, 0, map.size(), keyFile); + BinIO.storeInts(map.value, 0, map.size(), valueFile); + + map = null; + + long[][] key = BinIO.loadLongsBig(keyFile); + keyFile.delete(); + int[][] value = BinIO.loadIntsBig(valueFile); + valueFile.delete(); + + if (function == null) { + this.ids = new long[numNodes]; + + final long[] result = new long[numNodes]; + for (int i = numNodes; i-- != 0; ) result[BigArrays.get(value, i)] = BigArrays.get(key, i); + this.ids = result; + } + + key = null; + value = null; + + this.arcLabelledBatchGraph = new Transform.ArcLabelledBatchGraph(function == null ? numNodes : n, pairs, batches, labelBatches, prototype); + } + + protected static void logBatches(final ObjectArrayList batches, final long pairs, final ProgressLogger pl) { + long length = 0; + for(final File f : batches) length += f.length(); + pl.logger().info("Created " + batches.size() + " batches using " + Util.format((double)Byte.SIZE * length / pairs) + " bits/arc."); + } + + private final static long getLong(final byte[] array, int offset, int length) { + if (length == 0) throw new NumberFormatException("Empty number"); + int sign = 1; + if (array[offset] == '-') { + sign = -1; + offset++; + length--; + } + + long value = 0; + for (int i = 0; i < length; i++) { + final byte digit = array[offset + i]; + if (digit < '0' || digit > '9') throw new NumberFormatException("Not a digit: " + (char) digit); + value *= 10; + value += digit - '0'; + } + + return sign * value; + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param arcs an iterator returning the arcs as two-element arrays. + * @param symmetrize the new graph will be forced to be symmetric. 
+ * @param noLoops the new graph will have no loops. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by this method. + * @param tempDir a temporary directory for the batches, or null for {@link File#createTempFile(java.lang.String, java.lang.String)}'s choice. + * @param pl a progress logger, or null. + */ + public ScatteredLabelledArcsASCIIGraph(final Iterator arcs, final Iterator

The actual table size will be the least power of two greater than - * expected/f. - * - * @param expected the expected number of elements in the set. - * @param f the load factor. - */ - public Long2IntOpenHashBigMap(final long expected, final float f) { - if (f <= 0 || f > 1) - throw new IllegalArgumentException("Load factor must be greater than 0 and smaller than or equal to 1"); - if (this.n < 0) throw new IllegalArgumentException("The expected number of elements must be nonnegative"); - this.f = f; - this.n = bigArraySize(expected, f); - this.maxFill = maxFill(this.n, f); - this.key = LongBigArrays.newBigArray(this.n); - this.value = IntBigArrays.newBigArray(this.n); - this.used = BooleanBigArrays.newBigArray(this.n); - this.initMasks(); - } - - /** - * Creates a new hash big set with initial expected {@link Hash#DEFAULT_INITIAL_SIZE} elements and - * {@link Hash#DEFAULT_LOAD_FACTOR} as load factor. - */ - - public Long2IntOpenHashBigMap() { - this(DEFAULT_INITIAL_SIZE, DEFAULT_LOAD_FACTOR); - } - - public int put(final long k, final int v) { - final long h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); - - // The starting point. - int displ = (int)(h & this.segmentMask); - int base = (int)((h & this.mask) >>> BigArrays.SEGMENT_SHIFT); - - // There's always an unused entry. - while (this.used[base][displ]) { - if (k == this.key[base][displ]) { - final int oldValue = this.value[base][displ]; - this.value[base][displ] = v; - return oldValue; - } - base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; - } - - this.used[base][displ] = true; - this.key[base][displ] = k; - this.value[base][displ] = v; - - if (++this.size >= this.maxFill) this.rehash(2 * this.n); - return -1; - } - - public int get(final long k) { - final long h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); - - // The starting point. 
- int displ = (int)(h & this.segmentMask); - int base = (int)((h & this.mask) >>> BigArrays.SEGMENT_SHIFT); - - // There's always an unused entry. - while (this.used[base][displ]) { - if (k == this.key[base][displ]) return this.value[base][displ]; - base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; - } - - return -1; - } - - private void rehash(final long newN) { - final boolean[][] used = this.used; - final long[][] key = this.key; - final int[][] value = this.value; - final boolean[][] newUsed = BooleanBigArrays.newBigArray(newN); - final long[][] newKey = LongBigArrays.newBigArray(newN); - final int[][] newValue = IntBigArrays.newBigArray(newN); - final long newMask = newN - 1; - final int newSegmentMask = newKey[0].length - 1; - final int newBaseMask = newKey.length - 1; - - int base = 0, displ = 0; - long h; - long k; - - for (long i = this.size; i-- != 0; ) { - - while (!used[base][displ]) base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)); - - k = key[base][displ]; - h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); - - // The starting point. - int d = (int)(h & newSegmentMask); - int b = (int)((h & newMask) >>> BigArrays.SEGMENT_SHIFT); - - while (newUsed[b][d]) b = (b + ((d = (d + 1) & newSegmentMask) == 0 ? 1 : 0)) & newBaseMask; - - newUsed[b][d] = true; - newKey[b][d] = k; - newValue[b][d] = value[base][displ]; - - base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)); - } - - this.n = newN; - this.key = newKey; - this.value = newValue; - this.used = newUsed; - this.initMasks(); - this.maxFill = maxFill(this.n, this.f); - } - - public void compact() { - int base = 0, displ = 0, b = 0, d = 0; - for (long i = this.size; i-- != 0; ) { - while (!this.used[base][displ]) - base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 
1 : 0)) & this.baseMask; - this.key[b][d] = this.key[base][displ]; - this.value[b][d] = this.value[base][displ]; - base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; - b = (b + ((d = (d + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; - } - } - - public long size() { - return this.size; - } - } - /** * Creates a scattered-arcs ASCII graph. * @@ -430,13 +230,6 @@ public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFu this(is, function, labelPrototype, labelMapping, charset, n, symmetrize, noLoops, batchSize, tempDir, null); } - // TODO: Move somewhere else - // Given a label prototype and a value set the value inside the label without creating a new one - // it's like a setter, but for labels. - public interface LabelMapping { - void apply(Label prototype, String representation); - } - /** * Creates a scattered-arcs ASCII graph. * @@ -685,32 +478,6 @@ public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFu this.arcLabelledBatchGraph = new Transform.ArcLabelledBatchGraph(function == null ? 
numNodes : n, pairs, batches, labelBatches, prototype); } - protected static void logBatches(final ObjectArrayList batches, final long pairs, final ProgressLogger pl) { - long length = 0; - for (final File f : batches) length += f.length(); - pl.logger().info("Created " + batches.size() + " batches using " + Util.format((double)Byte.SIZE * length / pairs) + " bits/arc."); - } - - private final static long getLong(final byte[] array, int offset, int length) { - if (length == 0) throw new NumberFormatException("Empty number"); - int sign = 1; - if (array[offset] == '-') { - sign = -1; - offset++; - length--; - } - - long value = 0; - for (int i = 0; i < length; i++) { - final byte digit = array[offset + i]; - if (digit < '0' || digit > '9') throw new NumberFormatException("Not a digit: " + (char)digit); - value *= 10; - value += digit - '0'; - } - - return sign * value; - } - /** * Creates a scattered-arcs ASCII graph. * @@ -832,6 +599,32 @@ public ScatteredLabelledArcsASCIIGraph(final Iterator arcs, final Iterat this.arcLabelledBatchGraph = new Transform.ArcLabelledBatchGraph(numNodes, pairs, batches, labelBatches, prototype); } + protected static void logBatches(final ObjectArrayList batches, final long pairs, final ProgressLogger pl) { + long length = 0; + for (final File f : batches) length += f.length(); + pl.logger().info("Created " + batches.size() + " batches using " + Util.format((double)Byte.SIZE * length / pairs) + " bits/arc."); + } + + private final static long getLong(final byte[] array, int offset, int length) { + if (length == 0) throw new NumberFormatException("Empty number"); + int sign = 1; + if (array[offset] == '-') { + sign = -1; + offset++; + length--; + } + + long value = 0; + for (int i = 0; i < length; i++) { + final byte digit = array[offset + i]; + if (digit < '0' || digit > '9') throw new NumberFormatException("Not a digit: " + (char)digit); + value *= 10; + value += digit - '0'; + } + + return sign * value; + } + @Override public int 
numNodes() { if (this.arcLabelledBatchGraph == null) @@ -866,6 +659,225 @@ public ScatteredLabelledArcsASCIIGraph copy() { return this; } + @Override + public String toString() { + final MutableString ms = new MutableString(); + ArcLabelledNodeIterator nodeIterator = nodeIterator(); + ms.append("Nodes: " + numNodes() + "\nArcs: " + numArcs() + "\n"); + while (nodeIterator.hasNext()) { + int node = nodeIterator.nextInt(); + Label[] labels = nodeIterator.labelArray(); + ms.append("Successors of " + node + " (degree " + nodeIterator.outdegree() + "):"); + for (int k = 0; k < nodeIterator.outdegree(); k++) { + ms.append(" " + node + " (" + labels[k].get() + ")"); + } + ms.append("\n"); + + } + return ms.toString(); + } + + // TODO: Move somewhere else + // Given a label prototype and a value set the value inside the label without creating a new one + // it's like a setter, but for labels. + public interface LabelMapping { + void apply(Label prototype, String representation); + } + + private static final class Long2IntOpenHashBigMap implements java.io.Serializable, Cloneable, Hash { + public static final long serialVersionUID = 0L; + /** + * The acceptable load factor. + */ + private final float f; + /** + * The big array of keys. + */ + public transient long[][] key; + /** + * The big array of values. + */ + public transient int[][] value; + /** + * The big array telling whether a position is used. + */ + private transient boolean[][] used; + /** + * The current table size (always a power of 2). + */ + private transient long n; + + /** + * Threshold after which we rehash. It must be the table size times {@link #f}. + */ + private transient long maxFill; + + /** + * The mask for wrapping a position counter. + */ + private transient long mask; + + /** + * The mask for wrapping a segment counter. + */ + private transient int segmentMask; + + /** + * The mask for wrapping a base counter. + */ + private transient int baseMask; + + /** + * Number of entries in the set. 
+ */ + private long size; + + /** + * Creates a new hash big set. + * + *

The actual table size will be the least power of two greater than + * expected/f. + * + * @param expected the expected number of elements in the set. + * @param f the load factor. + */ + public Long2IntOpenHashBigMap(final long expected, final float f) { + if (f <= 0 || f > 1) + throw new IllegalArgumentException("Load factor must be greater than 0 and smaller than or equal to 1"); + if (this.n < 0) throw new IllegalArgumentException("The expected number of elements must be nonnegative"); + this.f = f; + this.n = bigArraySize(expected, f); + this.maxFill = maxFill(this.n, f); + this.key = LongBigArrays.newBigArray(this.n); + this.value = IntBigArrays.newBigArray(this.n); + this.used = BooleanBigArrays.newBigArray(this.n); + this.initMasks(); + } + + /** + * Creates a new hash big set with initial expected {@link Hash#DEFAULT_INITIAL_SIZE} elements and + * {@link Hash#DEFAULT_LOAD_FACTOR} as load factor. + */ + + public Long2IntOpenHashBigMap() { + this(DEFAULT_INITIAL_SIZE, DEFAULT_LOAD_FACTOR); + } + + /** + * Initialises the mask values. + */ + private void initMasks() { + this.mask = this.n - 1; + /* + * Note that either we have more than one segment, and in this case all segments are + * BigArrays.SEGMENT_SIZE long, or we have exactly one segment whose length is a power of + * two. + */ + this.segmentMask = this.key[0].length - 1; + this.baseMask = this.key.length - 1; + } + + public int put(final long k, final int v) { + final long h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); + + // The starting point. + int displ = (int)(h & this.segmentMask); + int base = (int)((h & this.mask) >>> BigArrays.SEGMENT_SHIFT); + + // There's always an unused entry. + while (this.used[base][displ]) { + if (k == this.key[base][displ]) { + final int oldValue = this.value[base][displ]; + this.value[base][displ] = v; + return oldValue; + } + base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 
1 : 0)) & this.baseMask; + } + + this.used[base][displ] = true; + this.key[base][displ] = k; + this.value[base][displ] = v; + + if (++this.size >= this.maxFill) this.rehash(2 * this.n); + return -1; + } + + public int get(final long k) { + final long h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); + + // The starting point. + int displ = (int)(h & this.segmentMask); + int base = (int)((h & this.mask) >>> BigArrays.SEGMENT_SHIFT); + + // There's always an unused entry. + while (this.used[base][displ]) { + if (k == this.key[base][displ]) return this.value[base][displ]; + base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; + } + + return -1; + } + + private void rehash(final long newN) { + final boolean[][] used = this.used; + final long[][] key = this.key; + final int[][] value = this.value; + final boolean[][] newUsed = BooleanBigArrays.newBigArray(newN); + final long[][] newKey = LongBigArrays.newBigArray(newN); + final int[][] newValue = IntBigArrays.newBigArray(newN); + final long newMask = newN - 1; + final int newSegmentMask = newKey[0].length - 1; + final int newBaseMask = newKey.length - 1; + + int base = 0, displ = 0; + long h; + long k; + + for (long i = this.size; i-- != 0; ) { + + while (!used[base][displ]) base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)); + + k = key[base][displ]; + h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); + + // The starting point. + int d = (int)(h & newSegmentMask); + int b = (int)((h & newMask) >>> BigArrays.SEGMENT_SHIFT); + + while (newUsed[b][d]) b = (b + ((d = (d + 1) & newSegmentMask) == 0 ? 1 : 0)) & newBaseMask; + + newUsed[b][d] = true; + newKey[b][d] = k; + newValue[b][d] = value[base][displ]; + + base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 
1 : 0)); + } + + this.n = newN; + this.key = newKey; + this.value = newValue; + this.used = newUsed; + this.initMasks(); + this.maxFill = maxFill(this.n, this.f); + } + + public void compact() { + int base = 0, displ = 0, b = 0, d = 0; + for (long i = this.size; i-- != 0; ) { + while (!this.used[base][displ]) + base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; + this.key[b][d] = this.key[base][displ]; + this.value[b][d] = this.value[base][displ]; + base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; + b = (b + ((d = (d + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; + } + } + + public long size() { + return this.size; + } + } + /* @SuppressWarnings("unchecked") public static void main(final String[] args) throws IllegalArgumentException, SecurityException, IOException, JSAPException, ClassNotFoundException { String basename; From 9f9fceb379c0adba99d1b02e806729efc2c7827d Mon Sep 17 00:00:00 2001 From: Luigi Foscari Date: Mon, 20 Feb 2023 15:53:54 +0100 Subject: [PATCH 61/85] fixed toString error --- .../webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java b/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java index a0353cd..72de9d2 100644 --- a/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java +++ b/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java @@ -666,10 +666,11 @@ public String toString() { ms.append("Nodes: " + numNodes() + "\nArcs: " + numArcs() + "\n"); while (nodeIterator.hasNext()) { int node = nodeIterator.nextInt(); + ArcLabelledNodeIterator.LabelledArcIterator successors = nodeIterator.successors(); Label[] labels = nodeIterator.labelArray(); ms.append("Successors of " + node + " (degree " + nodeIterator.outdegree() + "):"); for (int k = 0; k < 
nodeIterator.outdegree(); k++) { - ms.append(" " + node + " (" + labels[k].get() + ")"); + ms.append(" " + successors.nextInt() + " (" + labels[k].get() + ")"); } ms.append("\n"); From bab8f64397b8f57c8a478ae8b394649f71137b31 Mon Sep 17 00:00:00 2001 From: Luigi Foscari Date: Mon, 20 Feb 2023 16:41:44 +0100 Subject: [PATCH 62/85] changed processTransposeBatch to prune duplicate arcs --- src/it/unimi/dsi/webgraph/Transform.java | 64 +++++++++++++++++------- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/src/it/unimi/dsi/webgraph/Transform.java b/src/it/unimi/dsi/webgraph/Transform.java index c840833..8a6d019 100644 --- a/src/it/unimi/dsi/webgraph/Transform.java +++ b/src/it/unimi/dsi/webgraph/Transform.java @@ -1199,7 +1199,7 @@ public int[] successorArray() { final int numPairs = this.numPairs; // Neither quicksort nor heaps are stable, so we reestablish order here. IntArrays.quickSort(successor, 0, numPairs); - if (numPairs!= 0) { + if (numPairs != 0) { int p = 0; for (int j = 1; j < numPairs; j++) if (successor[p] != successor[j]) successor[++p] = successor[j]; outdegree = p + 1; @@ -1311,6 +1311,8 @@ class InternalArcLabelledNodeIterator extends ArcLabelledNodeIterator { private int last; /** The outdegree of the current node (valid if {@link #last} is not -1). */ private int outdegree; + /** The number of pairs associated with the current node (valid if {@link #last} is not -1). */ + private int numPairs; /** The successors of the current node (valid if {@link #last} is not -1); * only the first {@link #outdegree} entries are meaningful. 
*/ private int[] successor; @@ -1319,7 +1321,7 @@ class InternalArcLabelledNodeIterator extends ArcLabelledNodeIterator { private Label[] label; public InternalArcLabelledNodeIterator(final int upperBound) throws IOException { - this(upperBound, null, null, null, null, null, -1, 0, IntArrays.EMPTY_ARRAY, Label.EMPTY_LABEL_ARRAY); + this(upperBound, null, null, null, null, null, -1, -1, IntArrays.EMPTY_ARRAY, Label.EMPTY_LABEL_ARRAY); } public InternalArcLabelledNodeIterator(final int upperBound, final InputBitStream[] baseIbs, final InputBitStream[] baseLabelInputBitStream, final int[] refArray, final int[] prevTarget, final int[] inputStreamLength, final int last, final int outdegree, final int successor[], final Label[] label) throws IOException { @@ -1377,8 +1379,10 @@ public boolean hasNext() { @Override public int nextInt() { + if (! hasNext()) throw new NoSuchElementException(); last++; int d = 0; + outdegree = -1; int i; try { @@ -1395,8 +1399,8 @@ public int nextInt() { if (--inputStreamLength[i] == 0) { queue.dequeue(); batchIbs[i].close(); - labelInputBitStream[i].close(); batchIbs[i] = null; + labelInputBitStream[i].close(); labelInputBitStream[i] = null; } else { @@ -1410,8 +1414,19 @@ public int nextInt() { } d++; } + + numPairs = d; + } + catch(final IOException e) { + e.printStackTrace(); + throw new RuntimeException(this + " " + e); + } + + // Compute outdegree + if (outdegree == -1) { + final int numPairs = this.numPairs; // Neither quicksort nor heaps are stable, so we reestablish order here. 
- it.unimi.dsi.fastutil.Arrays.quickSort(0, d, (x, y) -> Integer.compare(successor[x], successor[y]), + it.unimi.dsi.fastutil.Arrays.quickSort(0, numPairs, (x, y) -> Integer.compare(successor[x], successor[y]), (x, y) -> { final int t = successor[x]; successor[x] = successor[y]; @@ -1420,12 +1435,16 @@ public int nextInt() { label[x] = label[y]; label[y] = l; }); - } - catch(final IOException e) { - throw new RuntimeException(e); + + if (numPairs != 0) { + // Avoid returning the duplicate arcs + int p = 0; + for (int j = 1; j < numPairs; j++) if (successor[p] != successor[j]) successor[++p] = successor[j]; + outdegree = p + 1; + } + else outdegree = 0; } - outdegree = d; return last; } @@ -1604,6 +1623,12 @@ public static int processTransposeBatch(final int n, final int[] source, final i batchFile.deleteOnExit(); batches.add(batchFile); final OutputBitStream batch = new OutputBitStream(batchFile); + + final File labelFile = File.createTempFile("label-", ".bits", tempDir); + labelFile.deleteOnExit(); + labelBatches.add(labelFile); + final OutputBitStream labelObs = new OutputBitStream(labelFile); + int u = 0; if (n != 0) { @@ -1616,32 +1641,35 @@ public static int processTransposeBatch(final int n, final int[] source, final i batch.writeDelta(prevSource); batch.writeDelta(target[0]); + labelBitStream.position(start[0]); + prototype.fromBitStream(labelBitStream, source[0]); + prototype.toBitStream(labelObs, target[0]); + for(int i = 1; i < n; i++) { if (source[i] != prevSource) { batch.writeDelta(source[i] - prevSource); batch.writeDelta(target[i]); prevSource = source[i]; + + labelBitStream.position(start[i]); + prototype.fromBitStream(labelBitStream, source[i]); + prototype.toBitStream(labelObs, target[i]); } else if (target[i] != target[i - 1]) { // We don't write duplicate pairs batch.writeDelta(0); batch.writeDelta(target[i] - target[i - 1] - 1); + + labelBitStream.position(start[i]); + prototype.fromBitStream(labelBitStream, source[i]); + 
prototype.toBitStream(labelObs, target[i]); } } } + else batch.writeDelta(0); batch.close(); - - final File labelFile = File.createTempFile("label-", ".bits", tempDir); - labelFile.deleteOnExit(); - labelBatches.add(labelFile); - final OutputBitStream labelObs = new OutputBitStream(labelFile); - for (int i = 0; i < n; i++) { - labelBitStream.position(start[i]); - prototype.fromBitStream(labelBitStream, source[i]); - prototype.toBitStream(labelObs, target[i]); - } labelObs.close(); return u; From 7f86ce950f6741a0b0e6310ec5b1757d708f8159 Mon Sep 17 00:00:00 2001 From: Luigi Foscari Date: Mon, 20 Feb 2023 16:43:00 +0100 Subject: [PATCH 63/85] minor refactoring and formatting --- .../ScatteredLabelledArcsASCIIGraphTest.java | 150 ++++++++---------- 1 file changed, 68 insertions(+), 82 deletions(-) diff --git a/test/it/unimi/dsi/webgraph/ScatteredLabelledArcsASCIIGraphTest.java b/test/it/unimi/dsi/webgraph/ScatteredLabelledArcsASCIIGraphTest.java index edb2d17..7b892b3 100644 --- a/test/it/unimi/dsi/webgraph/ScatteredLabelledArcsASCIIGraphTest.java +++ b/test/it/unimi/dsi/webgraph/ScatteredLabelledArcsASCIIGraphTest.java @@ -26,13 +26,12 @@ import org.junit.Test; import java.io.IOException; -import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.List; -import static it.unimi.dsi.webgraph.labelling.ScatteredLabelledArcsASCIIGraph.*; +import static it.unimi.dsi.webgraph.labelling.ScatteredLabelledArcsASCIIGraph.LabelMapping; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; @@ -42,91 +41,97 @@ public class ScatteredLabelledArcsASCIIGraphTest extends WebGraphTestCase { // TODO: label tests + private static Iterator toArcsIterator(final String s) { + final String[] arcs = s.split("\n"); + final List arcSet = new ArrayList<>(); + for (final String arc : arcs) { + final String[] parts = arc.split(" "); + arcSet.add(new long[] 
{Long.parseLong(parts[0]), Long.parseLong(parts[1])}); + } + return arcSet.iterator(); + } + + private static Iterator

The actual table size will be the least power of two greater than - * expected/f. - * - * @param expected the expected number of elements in the set. - * @param f the load factor. - */ - public Long2IntOpenHashBigMap(final long expected, final float f) { - if (f <= 0 || f > 1) - throw new IllegalArgumentException("Load factor must be greater than 0 and smaller than or equal to 1"); - if (this.n < 0) throw new IllegalArgumentException("The expected number of elements must be nonnegative"); - this.f = f; - this.n = bigArraySize(expected, f); - this.maxFill = maxFill(this.n, f); - this.key = LongBigArrays.newBigArray(this.n); - this.value = IntBigArrays.newBigArray(this.n); - this.used = BooleanBigArrays.newBigArray(this.n); - this.initMasks(); - } - - /** - * Creates a new hash big set with initial expected {@link Hash#DEFAULT_INITIAL_SIZE} elements and - * {@link Hash#DEFAULT_LOAD_FACTOR} as load factor. - */ - - public Long2IntOpenHashBigMap() { - this(DEFAULT_INITIAL_SIZE, DEFAULT_LOAD_FACTOR); - } - - /** - * Initialises the mask values. - */ - private void initMasks() { - this.mask = this.n - 1; - /* - * Note that either we have more than one segment, and in this case all segments are - * BigArrays.SEGMENT_SIZE long, or we have exactly one segment whose length is a power of - * two. - */ - this.segmentMask = this.key[0].length - 1; - this.baseMask = this.key.length - 1; - } - - public int put(final long k, final int v) { - final long h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); - - // The starting point. - int displ = (int)(h & this.segmentMask); - int base = (int)((h & this.mask) >>> BigArrays.SEGMENT_SHIFT); - - // There's always an unused entry. - while (this.used[base][displ]) { - if (k == this.key[base][displ]) { - final int oldValue = this.value[base][displ]; - this.value[base][displ] = v; - return oldValue; - } - base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 
1 : 0)) & this.baseMask; - } - - this.used[base][displ] = true; - this.key[base][displ] = k; - this.value[base][displ] = v; - - if (++this.size >= this.maxFill) this.rehash(2 * this.n); - return -1; - } - - public int get(final long k) { - final long h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); - - // The starting point. - int displ = (int)(h & this.segmentMask); - int base = (int)((h & this.mask) >>> BigArrays.SEGMENT_SHIFT); - - // There's always an unused entry. - while (this.used[base][displ]) { - if (k == this.key[base][displ]) return this.value[base][displ]; - base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; - } - - return -1; - } - - private void rehash(final long newN) { - final boolean[][] used = this.used; - final long[][] key = this.key; - final int[][] value = this.value; - final boolean[][] newUsed = BooleanBigArrays.newBigArray(newN); - final long[][] newKey = LongBigArrays.newBigArray(newN); - final int[][] newValue = IntBigArrays.newBigArray(newN); - final long newMask = newN - 1; - final int newSegmentMask = newKey[0].length - 1; - final int newBaseMask = newKey.length - 1; - - int base = 0, displ = 0; - long h; - long k; - - for (long i = this.size; i-- != 0; ) { - - while (!used[base][displ]) base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)); - - k = key[base][displ]; - h = it.unimi.dsi.fastutil.HashCommon.murmurHash3(k); - - // The starting point. - int d = (int)(h & newSegmentMask); - int b = (int)((h & newMask) >>> BigArrays.SEGMENT_SHIFT); - - while (newUsed[b][d]) b = (b + ((d = (d + 1) & newSegmentMask) == 0 ? 1 : 0)) & newBaseMask; - - newUsed[b][d] = true; - newKey[b][d] = k; - newValue[b][d] = value[base][displ]; - - base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 
1 : 0)); - } - - this.n = newN; - this.key = newKey; - this.value = newValue; - this.used = newUsed; - this.initMasks(); - this.maxFill = maxFill(this.n, this.f); - } - - public void compact() { - int base = 0, displ = 0, b = 0, d = 0; - for (long i = this.size; i-- != 0; ) { - while (!this.used[base][displ]) - base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; - this.key[b][d] = this.key[base][displ]; - this.value[b][d] = this.value[base][displ]; - base = (base + ((displ = (displ + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; - b = (b + ((d = (d + 1) & this.segmentMask) == 0 ? 1 : 0)) & this.baseMask; - } - } - - public long size() { - return this.size; - } - } - - /* @SuppressWarnings("unchecked") + @SuppressWarnings("unchecked") public static void main(final String[] args) throws IllegalArgumentException, SecurityException, IOException, JSAPException, ClassNotFoundException { - String basename; - final SimpleJSAP jsap = new SimpleJSAP(ScatteredLabelledArcsASCIIGraph.class.getName(), "Converts a scattered list of arcs from standard input into a BVGraph. The list of" + - "identifiers in order of appearance will be saved with extension \"" + IDS_EXTENSION + "\", unless a translation function has been specified.", + final SimpleJSAP jsap = new SimpleJSAP(ScatteredLabelledArcsASCIIGraph.class.getName(), + "Converts a scattered list of labelled arcs from standard input into a BVGraph. The list of " + + "identifiers in order of appearance will be saved with extension \"" + IDS_EXTENSION + "\", " + + "unless a translation function has been specified. The labels must be written after each " + + "arc, will be interpreted as integers and stored in gamma coding unless a mapping function " + + "has been specified alongside a label prototype. 
The underlying representation of the labels " + + "will be saved as the given basename with the \"" + UNDERLYINGGRAPH_SUFFIX + "\" suffix.", new Parameter[]{ new FlaggedOption("logInterval", JSAP.LONG_PARSER, Long.toString(ProgressLogger.DEFAULT_LOG_INTERVAL), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds."), new FlaggedOption("batchSize", JSAP.INTSIZE_PARSER, Integer.toString(DEFAULT_BATCH_SIZE), JSAP.NOT_REQUIRED, 's', "batch-size", "The maximum size of a batch, in arcs."), @@ -911,20 +725,26 @@ public static void main(final String[] args) throws IllegalArgumentException, Se new FlaggedOption("maxRefCount", JSAP.INTEGER_PARSER, String.valueOf(BVGraph.DEFAULT_MAX_REF_COUNT), JSAP.NOT_REQUIRED, 'm', "max-ref-count", "Maximum number of backward references (-1 for ∞)."), new FlaggedOption("minIntervalLength", JSAP.INTEGER_PARSER, String.valueOf(BVGraph.DEFAULT_MIN_INTERVAL_LENGTH), JSAP.NOT_REQUIRED, 'i', "min-interval-length", "Minimum length of an interval (0 to disable)."), new FlaggedOption("zetaK", JSAP.INTEGER_PARSER, String.valueOf(BVGraph.DEFAULT_ZETA_K), JSAP.NOT_REQUIRED, 'k', "zeta-k", "The k parameter for zeta-k codes."), + new FlaggedOption("labelPrototype", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'p', "label-prototype", "The prototype of the labels"), + new FlaggedOption("labelMapping", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'a', "label-mapping", "A serialised function from strings to the given label prototype that will be used to translate label strings to label object."), + new FlaggedOption("labelMergeStrategy", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'e', "label-merge-strategy", "A serialized LabelMergeStrategy object defining how to tread duplicater arcs with the same label."), new UnflaggedOption("basename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The basename of the output graph"), } ); final JSAPResult 
jsapResult = jsap.parse(args); - if (jsap.messagePrinted()) System.exit(1); + if (jsap.messagePrinted()) { + System.exit(1); + } - basename = jsapResult.getString("basename"); + String basename = jsapResult.getString("basename"); int flags = 0; - for (final String compressionFlag : jsapResult.getStringArray("comp")) + for(final String compressionFlag: jsapResult.getStringArray("comp")) try { flags |= BVGraph.class.getField(compressionFlag).getInt(BVGraph.class); - } catch (final Exception notFound) { + } + catch (final Exception notFound) { throw new JSAPException("Compression method " + compressionFlag + " unknown."); } @@ -937,27 +757,58 @@ public static void main(final String[] args) throws IllegalArgumentException, Se Object2LongFunction function = null; Charset charset = null; int n = -1; + if (jsapResult.userSpecified("function")) { function = (Object2LongFunction) BinIO.loadObject(jsapResult.getString("function")); charset = Charset.forName(jsapResult.getString("charset")); if (function.size() == -1) { - if (!jsapResult.userSpecified("n")) + if (!jsapResult.userSpecified("n")) { throw new IllegalArgumentException("You must specify a graph size if you specify a translation function that does not return the size of the key set."); + } n = jsapResult.getInt("n"); - } else n = function.size(); + } else { + n = function.size(); + } + } + + if (jsapResult.userSpecified("labelPrototype") != jsapResult.userSpecified("labelMapping")) { + throw new IllegalArgumentException("You must specify either both a label prototype and a label mapping or none."); + } + + Label labelPrototype = new GammaCodedIntLabel("FOO"); + if (jsapResult.userSpecified("labelPrototype")) { + labelPrototype = (Label) BinIO.loadObject(jsapResult.getString("labelPrototype")); + } + + LabelMapping labelMapping = (label, st) -> ((GammaCodedIntLabel)label).value = Integer.parseInt((String) st); + if (jsapResult.userSpecified("labelMapping")) { + labelMapping = (LabelMapping) 
BinIO.loadObject(jsapResult.getString("labelMapping")); + } + + LabelMergeStrategy labelMergeStrategy = null; + if (jsapResult.userSpecified("labelMergeStrategy")) { + labelMergeStrategy = (LabelMergeStrategy) BinIO.loadObject(jsapResult.getString("labelMergeStrategy")); } File tempDir = null; - if (jsapResult.userSpecified("tempDir")) tempDir = new File(jsapResult.getString("tempDir")); + if (jsapResult.userSpecified("tempDir")) { + tempDir = new File(jsapResult.getString("tempDir")); + } final ProgressLogger pl = new ProgressLogger(LOGGER, jsapResult.getLong("logInterval"), TimeUnit.MILLISECONDS); final boolean zipped = jsapResult.getBoolean("zipped"); final InputStream inStream = (zipped ? new GZIPInputStream(System.in) : System.in); - final ScatteredLabelledArcsASCIIGraph graph = new ScatteredLabelledArcsASCIIGraph(inStream, function, - // TODO: insert default labelMapping e labelFunction - charset, n, jsapResult.userSpecified("symmetrize"), jsapResult.userSpecified("noLoops"), jsapResult.getInt("batchSize"), tempDir, pl); - BVGraph.store(graph, basename, windowSize, maxRefCount, minIntervalLength, zetaK, flags, pl); - if (function == null) BinIO.storeLongs(graph.ids, basename + IDS_EXTENSION); - } */ + + final ScatteredLabelledArcsASCIIGraph graph = new ScatteredLabelledArcsASCIIGraph( + inStream, function, labelPrototype, labelMapping, labelMergeStrategy, + charset, n, jsapResult.userSpecified("symmetrize"), jsapResult.userSpecified("noLoops"), + jsapResult.getInt("batchSize"), tempDir, pl); + BVGraph.storeLabelled(graph.arcLabelledBatchGraph, basename, basename + UNDERLYINGGRAPH_SUFFIX, + windowSize, maxRefCount, minIntervalLength, zetaK, flags, pl); + + if (function == null) { + BinIO.storeLongs(graph.ids, basename + IDS_EXTENSION); + } + } } diff --git a/test/it/unimi/dsi/webgraph/ScatteredLabelledArcsASCIIGraphTest.java b/test/it/unimi/dsi/webgraph/ScatteredLabelledArcsASCIIGraphTest.java index 5e05afc..c85ddd0 100644 --- 
a/test/it/unimi/dsi/webgraph/ScatteredLabelledArcsASCIIGraphTest.java +++ b/test/it/unimi/dsi/webgraph/ScatteredLabelledArcsASCIIGraphTest.java @@ -39,7 +39,7 @@ public class ScatteredLabelledArcsASCIIGraphTest extends WebGraphTestCase { private static final Label gammaPrototype = new GammaCodedIntLabel("FOO"); private static final Long2IntFunction identity = Math::toIntExact; private static final LabelMapping hashcodeMapping = (label, st) -> ((GammaCodedIntLabel)label).value = st.hashCode(); - private static final LabelMapping integerMapping = (label, st) -> ((GammaCodedIntLabel)label).value = Integer.parseInt(st); + private static final LabelMapping integerMapping = (label, st) -> ((GammaCodedIntLabel)label).value = Integer.parseInt((String) st); private static Iterator toArcsIterator(final String s) { final String[] arcs = s.split("\n"); From a8f8b0f60866005877cc857b5cdfef77d25149f9 Mon Sep 17 00:00:00 2001 From: Luigi Foscari Date: Wed, 22 Feb 2023 17:15:24 +0100 Subject: [PATCH 75/85] reordered parameters in constructor for consistency, added more constructors, added class and variables explanation --- .../ScatteredLabelledArcsASCIIGraph.java | 343 ++++++++++++------ .../ScatteredLabelledArcsASCIIGraphTest.java | 47 ++- 2 files changed, 257 insertions(+), 133 deletions(-) diff --git a/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java b/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java index a14cca5..49369a8 100644 --- a/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java +++ b/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java @@ -19,16 +19,11 @@ import com.martiansoftware.jsap.*; import it.unimi.dsi.Util; -import it.unimi.dsi.fastutil.BigArrays; -import it.unimi.dsi.fastutil.Hash; -import it.unimi.dsi.fastutil.booleans.BooleanBigArrays; import it.unimi.dsi.fastutil.bytes.ByteArrays; -import it.unimi.dsi.fastutil.ints.IntBigArrays; import it.unimi.dsi.fastutil.io.BinIO; 
import it.unimi.dsi.fastutil.io.FastBufferedInputStream; import it.unimi.dsi.fastutil.io.FastByteArrayOutputStream; import it.unimi.dsi.fastutil.longs.Long2IntFunction; -import it.unimi.dsi.fastutil.longs.LongBigArrays; import it.unimi.dsi.fastutil.objects.Object2IntFunction; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.fastutil.objects.ObjectArrayList; @@ -36,10 +31,8 @@ import it.unimi.dsi.io.OutputBitStream; import it.unimi.dsi.lang.MutableString; import it.unimi.dsi.logging.ProgressLogger; -import it.unimi.dsi.webgraph.BVGraph; -import it.unimi.dsi.webgraph.ImmutableSequentialGraph; -import it.unimi.dsi.webgraph.ScatteredArcsASCIIGraph; -import it.unimi.dsi.webgraph.Transform; +import it.unimi.dsi.sux4j.mph.GOV3Function; +import it.unimi.dsi.webgraph.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -52,13 +45,95 @@ import java.util.concurrent.TimeUnit; import java.util.zip.GZIPInputStream; -import static it.unimi.dsi.fastutil.HashCommon.bigArraySize; -import static it.unimi.dsi.fastutil.HashCommon.maxFill; import static it.unimi.dsi.webgraph.Transform.processTransposeBatch; import static it.unimi.dsi.webgraph.labelling.ArcLabelledImmutableGraph.UNDERLYINGGRAPH_SUFFIX; /** - * TODO: write description (adapt the one from ScatteredArcsASCIIGraph) + * An {@link ArcLabelledImmutableGraph} that corresponds to a labelled graph stored as a scattered list of arcs. + * + *

+ * A scattered list of arcs describes a graph in a fairly loose way. Each line contains a + * labelled arc specified as two node identifiers and a label separated by whitespace (but we suggest exactly one TAB + * character). + * + *

+ * In the standard description, node identifiers can be in the range + * [-2^63..2^63): they will be remapped in a compact identifier space by + * assigning to each newly seen identifier a new node number. The list of identifiers in order of + * appearance is available in {@link #ids}. Lines can be empty, or comments starting with + * #. Characters following the label will be discarded with a warning. + * Similarly, the labels can be in the range [-2^63..2^63) and will be saved + * as-is in gamma coding; in case of duplicate arcs only the last label will be considered, + * but this behaviour can be changed by providing more parameters. + * + *

+ * Warning: Lines not conforming to the above specification will cause an error to be + * logged, but will be otherwise ignored. + * + *

+ * Alternatively, you can + * {@linkplain #ScatteredLabelledArcsASCIIGraph(InputStream, Object2LongFunction, Charset, int, boolean) + * provide} an {@link Object2LongFunction Object2LongFunction<String>} with default return value + * -1 that will be used to map identifiers to node numbers, along with a {@link Charset} to parse + * lines and the number of nodes of the graph (which must be a strict upper bound for the largest + * value returned by the function). Note that in principle an {@link Object2IntFunction} would be + * sufficient, but we want to make it easier to use functions from Sux4J such as {@link GOV3Function}. + * + *

+ * Additionally, the resulting graph can be symmetrized, and its loops be removed, using + * {@linkplain #ScatteredLabelledArcsASCIIGraph(InputStream, boolean, boolean, int, File, ProgressLogger) + * suitable constructor options}. + * + *

+ * Using {@linkplain #ScatteredLabelledArcsASCIIGraph(InputStream, Label, LabelMapping, LabelMergeStrategy) + * suitable constructor options}, you can provide a {@link Label} as prototype, a {@link LabelMapping} as a way to + * convert the written labels to objects of the prototype's type, and a {@link LabelMergeStrategy} + * to handle the case of identical arcs with different labels. + * + *

+ * This class has no load method, and its main method converts a scattered-arcs representation + * directly into a {@link BVGraph}. + * + *

Using {@link ScatteredLabelledArcsASCIIGraph} to convert your data

+ * + *

+ * A simple (albeit rather inefficient) way to import data into WebGraph is using ASCII graphs + * specified by scattered arcs. Suppose you create the following file, named + * example.arcs: + * + *

+ *  # My graph
+ *  -1 15 100
+ *  15 2 200
+ *  2 -1 300 This will cause a warning to be logged
+ *  OOPS! (This will cause an error to be logged)
+ *  -1 2 400
+ * 
+ * + * Then, the command + * + *
+ *  java it.unimi.dsi.webgraph.labelling.ScatteredLabelledArcsASCIIGraph example < example.arcs
+ * 
+ * + * will produce a compressed labelled graph in {@link it.unimi.dsi.webgraph.BVGraph} format. + * The underlying graph will be saved with basename example-underlying. + * The file example.ids will contain the list of longs -1, 15, 2. + * The node with identifier -1 will be node 0 in the output graph, the node with identifier + * 15 will be node 1, and the node with identifier 2 will be node 2. The graph example + * will thus have three nodes and four arcs (viz., <0,1>, <0,2>, <1,2> and + * <2,0>). The labels will be saved as example.labels in the order of visit + * of the arcs; the file example.labeloffsets records the offset of each label, + * because in general labels are not written in a fixed number of bits. + * + *

Memory requirements

+ * + *

+ * To convert node identifiers to node numbers, instances of this class use a custom map that in the + * worst case will require + * 19.5 × 2^⌈log(4n/3)⌉ ≤ 52n bytes, + * where n is the number of distinct identifiers. Storing batches of arcs in memory + * requires 8 bytes per arc. */ public class ScatteredLabelledArcsASCIIGraph extends ImmutableSequentialGraph { @@ -66,8 +141,18 @@ public class ScatteredLabelledArcsASCIIGraph extends ImmutableSequentialGraph { * The default batch size. */ public static final int DEFAULT_BATCH_SIZE = 1000000; + /** + * The default label prototype. + */ + public static final Label DEFAULT_LABEL_PROTOTYPE = new GammaCodedIntLabel("FOO"); + /** + * The default label mapping function. + */ + public static final LabelMapping DEFAULT_LABEL_MAPPING = (label, st) -> ((GammaCodedIntLabel) label).value = Integer.parseInt((String) st); + private static final Logger LOGGER = LoggerFactory.getLogger(ScatteredLabelledArcsASCIIGraph.class); private final static boolean DEBUG = false; + /** * The extension of the identifier file (a binary list of longs). */ @@ -81,6 +166,15 @@ public class ScatteredLabelledArcsASCIIGraph extends ImmutableSequentialGraph { */ public long[] ids; + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a standard scattered list of arcs. + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is) throws IOException { + this(is, DEFAULT_LABEL_PROTOTYPE, DEFAULT_LABEL_MAPPING, null, false); + } + /** * Creates a scattered-arcs ASCII graph. * @@ -165,6 +259,22 @@ public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPr this(is, labelPrototype, labelMapping, labelMergeStrategy, symmetrize, noLoops, batchSize, tempDir, null); } + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a standard scattered list of arcs. + * @param symmetrize the new graph will be forced to be symmetric. 
+ * @param noLoops the new graph will have no loops. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by + * this method. + * @param tempDir a temporary directory for the batches, or null for + * {@link File#createTempFile(String, String)}'s choice. + * @param pl a progress logger, or null. + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { + this(is, null, null, -1, DEFAULT_LABEL_PROTOTYPE, DEFAULT_LABEL_MAPPING, null, symmetrize, noLoops, batchSize, tempDir, pl); + } + /** * Creates a scattered-arcs ASCII graph. * @@ -181,128 +291,143 @@ public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPr * @param pl a progress logger, or null. */ public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { - this(is, null, labelPrototype, labelMapping, labelMergeStrategy, null, -1, symmetrize, noLoops, batchSize, tempDir, pl); + this(is, null, null, -1, labelPrototype, labelMapping, labelMergeStrategy, symmetrize, noLoops, batchSize, tempDir, pl); } /** * Creates a scattered-arcs ASCII graph. * - * @param is an input stream containing a scattered list of arcs. - * @param function an explicitly provided function from string representing nodes to node numbers, or - * null for the standard behaviour. - * @param labelPrototype an example of the labels contained in the graph. - * @param labelMapping a function mapping string into the label defined by the prototype. + * @param is an input stream containing a scattered list of arcs. 
+ * @param function an explicitly provided function from string representing nodes to node numbers, or + * null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or + * null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param labelPrototype an example of the labels contained in the graph. + * @param labelMapping a function mapping string into the label defined by the prototype. * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels. - * @param charset a character set that will be used to read the identifiers passed to function, or - * null for ISO-8859-1 (used only if function is not null). - * @param n the number of nodes of the graph (used only if function is not null). */ - public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final Charset charset, final int n) throws IOException { - this(is, function, labelPrototype, labelMapping, labelMergeStrategy, charset, n, false); + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Charset charset, final int n, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy) throws IOException { + this(is, function, charset, n, labelPrototype, labelMapping, labelMergeStrategy, false); } /** * Creates a scattered-arcs ASCII graph. * - * @param is an input stream containing a scattered list of arcs. - * @param function an explicitly provided function from string representing nodes to node numbers, or - * null for the standard behaviour. - * @param labelPrototype an example of the labels contained in the graph. 
- * @param labelMapping a function mapping string into the label defined by the prototype. + * @param is an input stream containing a scattered list of arcs. + * @param function an explicitly provided function from string representing nodes to node numbers, or + * null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or + * null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param symmetrize the new graph will be forced to be symmetric. + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Charset charset, final int n, final boolean symmetrize) throws IOException { + this(is, function, charset, n, DEFAULT_LABEL_PROTOTYPE, DEFAULT_LABEL_MAPPING, null, symmetrize, false); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a scattered list of arcs. + * @param function an explicitly provided function from string representing nodes to node numbers, or + * null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or + * null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param labelPrototype an example of the labels contained in the graph. + * @param labelMapping a function mapping string into the label defined by the prototype. * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels. - * @param charset a character set that will be used to read the identifiers passed to function, or - * null for ISO-8859-1 (used only if function is not null). - * @param n the number of nodes of the graph (used only if function is not null). 
- * @param symmetrize the new graph will be forced to be symmetric. + * @param symmetrize the new graph will be forced to be symmetric. */ - public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final Charset charset, final int n, final boolean symmetrize) throws IOException { - this(is, function, labelPrototype, labelMapping, labelMergeStrategy, charset, n, symmetrize, false); + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Charset charset, final int n, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final boolean symmetrize) throws IOException { + this(is, function, charset, n, labelPrototype, labelMapping, labelMergeStrategy, symmetrize, false); } /** * Creates a scattered-arcs ASCII graph. * - * @param is an input stream containing a scattered list of arcs. - * @param function an explicitly provided function from string representing nodes to node numbers, or - * null for the standard behaviour. - * @param labelPrototype an example of the labels contained in the graph. - * @param labelMapping a function mapping string into the label defined by the prototype. + * @param is an input stream containing a scattered list of arcs. + * @param function an explicitly provided function from string representing nodes to node numbers, or + * null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or + * null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param labelPrototype an example of the labels contained in the graph. + * @param labelMapping a function mapping string into the label defined by the prototype. 
* @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels. - * @param charset a character set that will be used to read the identifiers passed to function, or - * null for ISO-8859-1 (used only if function is not null). - * @param n the number of nodes of the graph (used only if function is not null). - * @param symmetrize the new graph will be forced to be symmetric. - * @param noLoops the new graph will have no loops. + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. */ - public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final Charset charset, final int n, final boolean symmetrize, final boolean noLoops) throws IOException { - this(is, function, labelPrototype, labelMapping, labelMergeStrategy, charset, n, symmetrize, noLoops, DEFAULT_BATCH_SIZE); + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Charset charset, final int n, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final boolean symmetrize, final boolean noLoops) throws IOException { + this(is, function, charset, n, labelPrototype, labelMapping, labelMergeStrategy, symmetrize, noLoops, DEFAULT_BATCH_SIZE); } /** * Creates a scattered-arcs ASCII graph. * - * @param is an input stream containing a scattered list of arcs. - * @param function an explicitly provided function from string representing nodes to node numbers, or - * null for the standard behaviour. - * @param labelPrototype an example of the labels contained in the graph. - * @param labelMapping a function mapping string into the label defined by the prototype. + * @param is an input stream containing a scattered list of arcs. 
+ * @param function an explicitly provided function from string representing nodes to node numbers, or + * null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or + * null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param labelPrototype an example of the labels contained in the graph. + * @param labelMapping a function mapping string into the label defined by the prototype. * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels. - * @param charset a character set that will be used to read the identifiers passed to function, or - * null for ISO-8859-1 (used only if function is not null). - * @param n the number of nodes of the graph (used only if function is not null). - * @param symmetrize the new graph will be forced to be symmetric. - * @param noLoops the new graph will have no loops. - * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by - * this method. + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by + * this method. 
*/ - public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final Charset charset, final int n, final boolean symmetrize, final boolean noLoops, final int batchSize) throws IOException { - this(is, function, labelPrototype, labelMapping, labelMergeStrategy, charset, n, symmetrize, noLoops, batchSize, null); + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Charset charset, final int n, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final boolean symmetrize, final boolean noLoops, final int batchSize) throws IOException { + this(is, function, charset, n, labelPrototype, labelMapping, labelMergeStrategy, symmetrize, noLoops, batchSize, null); } /** * Creates a scattered-arcs ASCII graph. * - * @param is an input stream containing a scattered list of arcs. - * @param function an explicitly provided function from string representing nodes to node numbers, or - * null for the standard behaviour. - * @param labelPrototype an example of the labels contained in the graph. - * @param labelMapping a function mapping string into the label defined by the prototype. + * @param is an input stream containing a scattered list of arcs. + * @param function an explicitly provided function from string representing nodes to node numbers, or + * null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or + * null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param labelPrototype an example of the labels contained in the graph. + * @param labelMapping a function mapping string into the label defined by the prototype. 
* @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels. - * @param charset a character set that will be used to read the identifiers passed to function, or - * null for ISO-8859-1 (used only if function is not null). - * @param n the number of nodes of the graph (used only if function is not null). - * @param symmetrize the new graph will be forced to be symmetric. - * @param noLoops the new graph will have no loops. - * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by - * this method. - * @param tempDir a temporary directory for the batches, or null for - * {@link File#createTempFile(java.lang.String, java.lang.String)}'s choice. + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by + * this method. + * @param tempDir a temporary directory for the batches, or null for + * {@link File#createTempFile(String, String)}'s choice. 
*/ - public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final Charset charset, final int n, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir) throws IOException { - this(is, function, labelPrototype, labelMapping, labelMergeStrategy, charset, n, symmetrize, noLoops, batchSize, tempDir, null); + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Charset charset, final int n, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir) throws IOException { + this(is, function, charset, n, labelPrototype, labelMapping, labelMergeStrategy, symmetrize, noLoops, batchSize, tempDir, null); } /** * Creates a scattered-arcs ASCII graph. * - * @param is an input stream containing a scattered list of arcs. - * @param function an explicitly provided function from string representing nodes to node numbers, or - * null for the standard behaviour. - * @param labelPrototype an example of the labels contained in the graph. - * @param labelMapping a function mapping string into the label defined by the prototype. + * @param is an input stream containing a scattered list of arcs. + * @param function an explicitly provided function from string representing nodes to node numbers, or + * null for the standard behaviour. + * @param charset a character set that will be used to read the identifiers passed to function, or + * null for ISO-8859-1 (used only if function is not null). + * @param n the number of nodes of the graph (used only if function is not null). + * @param labelPrototype an example of the labels contained in the graph. 
+ * @param labelMapping a function mapping string into the label defined by the prototype. * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels. - * @param charset a character set that will be used to read the identifiers passed to function, or - * null for ISO-8859-1 (used only if function is not null). - * @param n the number of nodes of the graph (used only if function is not null). - * @param symmetrize the new graph will be forced to be symmetric. - * @param noLoops the new graph will have no loops. - * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by - * this method. - * @param tempDir a temporary directory for the batches, or null for - * {@link File#createTempFile(java.lang.String, java.lang.String)}'s choice. - * @param pl a progress logger, or null. + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by + * this method. + * @param tempDir a temporary directory for the batches, or null for + * {@link File#createTempFile(String, String)}'s choice. + * @param pl a progress logger, or null. 
*/ - public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, Charset charset, final int n, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, Charset charset, final int n, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { @SuppressWarnings("resource") final FastBufferedInputStream fbis = new FastBufferedInputStream(is); ScatteredArcsASCIIGraph.Id2NodeMap map = new ScatteredArcsASCIIGraph.Id2NodeMap(); @@ -509,20 +634,20 @@ public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFu /** * Creates a scattered-arcs ASCII graph. * - * @param arcs an iterator returning the arcs as two-element arrays. - * @param function a function to map the long ids passed in arcs to int nodes. - * @param arcLabels a homogeneous iterator returning the labels in the same order as the arcs. + * @param arcs an iterator returning the arcs as two-element arrays. + * @param function a function to map the long ids passed in arcs to int nodes. + * @param n the number of nodes of the graph (used only if function is not null). + * @param arcLabels a homogeneous iterator returning the labels in the same order as the arcs. * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels. - * @param n the number of nodes of the graph (used only if function is not null). - * @param symmetrize the new graph will be forced to be symmetric. - * @param noLoops the new graph will have no loops. 
- * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by - * this method. - * @param tempDir a temporary directory for the batches, or null for - * {@link File#createTempFile(java.lang.String, java.lang.String)}'s choice. - * @param pl a progress logger, or null. + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by + * this method. + * @param tempDir a temporary directory for the batches, or null for + * {@link File#createTempFile(String, String)}'s choice. + * @param pl a progress logger, or null. */ - public ScatteredLabelledArcsASCIIGraph(final Iterator arcs, final Long2IntFunction function, final Iterator