From ae0fa98a63fa4fe81e6b25d80c07560d1d330d76 Mon Sep 17 00:00:00 2001
From: Tobias Wrigstad <tobias.wrigstad@it.uu.se>
Date: Wed, 22 Apr 2020 22:50:00 +0200
Subject: [PATCH 1/3] Optimised clone list creation

---
 Programs/src/notebooks/SccOutputAnalyzer.java | 93 ++++++++++++++++++-
 Programs/src/notebooks/SccSnippetId.java      |  4 +
 2 files changed, 93 insertions(+), 4 deletions(-)
diff --git a/Programs/src/notebooks/SccOutputAnalyzer.java b/Programs/src/notebooks/SccOutputAnalyzer.java
index 2d56bf1..0f479fd 100644
--- a/Programs/src/notebooks/SccOutputAnalyzer.java
+++ b/Programs/src/notebooks/SccOutputAnalyzer.java
@@ -5,6 +5,7 @@
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@@ -132,7 +133,22 @@ private void addOrIncrease(Map<String, Integer> map, String key) {
 	 * files from SourcererCC.
 	 */
 	private Map<SnippetCode, List<Snippet>> getClones(String pairFile) throws IOException {
-		List<List<SccSnippetId>> clones = getCloneLists(pairFile);
+		// List<List<SccSnippetId>> clones = getCloneLists(pairFile);
+		// List<List<SccSnippetId>> clones2 = getCloneListsAlt(pairFile);
+
+    // HashSet<HashSet<SccSnippetId>> a = new HashSet<HashSet<SccSnippetId>>();
+    // HashSet<HashSet<SccSnippetId>> b = new HashSet<HashSet<SccSnippetId>>();
+    
+    // for (List<SccSnippetId> il : clones) {
+    //     a.add(new HashSet<SccSnippetId>(il));
+    // }
+    // for (List<SccSnippetId> il : clones2) {
+    //     b.add(new HashSet<SccSnippetId>(il));
+    // }
+
+    // System.err.println("ALTERNATIVE " + a.equals(b));
+    
+		List<List<SccSnippetId>> clones = getCloneListsAlt(pairFile);
 		return getCloneMap(clones);
 	}
 	
@@ -140,6 +156,7 @@ private List<List<SccSnippetId>> getCloneLists(String pairFile) throws FileNotFo
 		List<List<SccSnippetId>> clones = new ArrayList<List<SccSnippetId>>();
 		Scanner scanner = new Scanner(new File(pairFile));
 		int numRead = 0;
+    long t1 = System.currentTimeMillis();
 		while (scanner.hasNextLine()) {
 			String line = scanner.nextLine();
 			assert(line.matches("[0-9]+,[0-9]+,[0-9]+,[0-9]+"));
@@ -169,15 +186,83 @@ private List<List<SccSnippetId>> getCloneLists(String pairFile) throws FileNotFo
 				clones.add(newCloneList);
 			}
 			numRead++;
-			if (0 == numRead%1000000) {
-				System.out.println(numRead + " clone pairs read.");
+			if (0 == numRead%10000) {
+          long t2 = System.currentTimeMillis();
+          System.out.println(numRead + " clone pairs read in " + (t2 - t1) + " ms");
+          t1 = t2;
 			}
 		}
 		scanner.close();
 		return clones;
 	}
+
+    private List<List<SccSnippetId>> getCloneListsAlt(String pairFile) throws FileNotFoundException {
+        Scanner scanner = new Scanner(new File(pairFile));
+        int numRead = 0;
+
+        HashMap<SccSnippetId, HashSet<SccSnippetId>> clones = new HashMap<SccSnippetId, HashSet<SccSnippetId>>();
+
+        long t1 = System.currentTimeMillis();
+
+        while (scanner.hasNextLine()) {
+            String line = scanner.nextLine();
+            assert(line.matches("[0-9]+,[0-9]+,[0-9]+,[0-9]+"));
+            String[] numbers = line.split(",");
+            SccSnippetId id1 = new SccSnippetId(Integer.parseInt(numbers[0]), Integer.parseInt(numbers[1]));
+            SccSnippetId id2 = new SccSnippetId(Integer.parseInt(numbers[2]), Integer.parseInt(numbers[3]));
+
+            HashSet<SccSnippetId> id1Clones = clones.get(id1);
+            HashSet<SccSnippetId> id2Clones = clones.get(id2);
+
+            if (id1Clones == id2Clones) {
+                if (id1Clones != null) {
+                    /// We already had them marked as clones
+                } else {
+                    /// Create a new clone set with this data
+                    HashSet<SccSnippetId> newSet = new HashSet<SccSnippetId>();
+                    newSet.add(id1);
+                    newSet.add(id2);
+                    clones.put(id1, newSet);
+                    clones.put(id2, newSet);
+                }
+            } else {
+                /// Merge the sets as they are both clones, and point both to same set
+                if (id1Clones == null) {
+                    id2Clones.add(id1);
+                    clones.put(id1, id2Clones);
+                } else if (id2Clones == null) {
+                    id1Clones.add(id2);
+                    clones.put(id2, id1Clones);
+                } else {
+                    id1Clones.addAll(id2Clones);
+                    clones.put(id2, id1Clones);
+                }
+            }
+
+            numRead++;
+            if (0 == numRead%10000) {
+                long t2 = System.currentTimeMillis();
+                System.out.println(numRead + " clone pairs read in " + (t2 - t1) + " ms");
+                t1 = t2;
+            }
+        }
+
+        List<List<SccSnippetId>> result = new ArrayList<List<SccSnippetId>>();
+
+        HashSet<HashSet<SccSnippetId>> alreadyIncluded = new HashSet<HashSet<SccSnippetId>>();
+        for(HashSet<SccSnippetId> list : clones.values()) {
+            if (alreadyIncluded.contains(list) == false) {
+                result.add(new ArrayList<SccSnippetId>(list));
+                alreadyIncluded.add(list);
+            }
+        }
+
+        scanner.close();
+        return result;
+    }
 	
-	private Map<SnippetCode, List<Snippet>> getCloneMap(List<List<SccSnippetId>> clones)
+
+    private Map<SnippetCode, List<Snippet>> getCloneMap(List<List<SccSnippetId>> clones)
 			throws FileNotFoundException {
 		Map<SnippetCode, List<Snippet>> result = new HashMap<SnippetCode, List<Snippet>>(clones.size());
 		Set<SccSnippetId> snippetIdsToAdd = notebookNumbers.keySet();
diff --git a/Programs/src/notebooks/SccSnippetId.java b/Programs/src/notebooks/SccSnippetId.java
index 4468232..7d197bb 100644
--- a/Programs/src/notebooks/SccSnippetId.java
+++ b/Programs/src/notebooks/SccSnippetId.java
@@ -24,4 +24,8 @@ public boolean equals(Object other) {
 	public int hashCode() {
 		return Objects.hash(nbID, snippetID);
 	}
+
+    public String toString() {
+        return this.nbID + "@" + this.snippetID;
+    }
 }

From d9f15db1a527f6ba4d17802e15463168d5794e1a Mon Sep 17 00:00:00 2001
From: malinkallen <github.ed5b79@m.insidarum.se>
Date: Tue, 16 Jun 2020 16:41:07 +0200
Subject: [PATCH 2/3] Verify that the optimization of getCloneLists doesn't
 produce duplicated clone groups.

---
 Programs/test/data/scc/clone_pairs_clopt      |  6 +++++
 Programs/test/data/scc/file_stats_clopt       |  4 +++
 .../test/notebooks/SccOutputAnalyzerTest.java | 27 +++++++++++++++++--
 3 files changed, 35 insertions(+), 2 deletions(-)
 create mode 100644 Programs/test/data/scc/clone_pairs_clopt
 create mode 100644 Programs/test/data/scc/file_stats_clopt

diff --git a/Programs/test/data/scc/clone_pairs_clopt b/Programs/test/data/scc/clone_pairs_clopt
new file mode 100644
index 0000000..4f80e63
--- /dev/null
+++ b/Programs/test/data/scc/clone_pairs_clopt
@@ -0,0 +1,6 @@
+1,1,1,2
+1,3,1,4
+1,1,1,3
+1,4,1,1
+1,2,1,3
+1,2,1,4
diff --git a/Programs/test/data/scc/file_stats_clopt b/Programs/test/data/scc/file_stats_clopt
new file mode 100644
index 0000000..6941461
--- /dev/null
+++ b/Programs/test/data/scc/file_stats_clopt
@@ -0,0 +1,4 @@
+1,1,"/path/to/nb_1.zip/nb_1_0.py","NULL/.py","abc",X,X,X,10
+1,2,"/path/to/nb_2.zip/nb_2_0.py","NULL/.py","def",X,X,X,10
+1,3,"/path/to/nb_2.zip/nb_2_1.py","NULL/.py","ghi",X,X,X,10
+1,4,"/path/to/nb_1.zip/nb_1_1.py","NULL/.py","jkl",X,X,X,10
diff --git a/Programs/test/notebooks/SccOutputAnalyzerTest.java b/Programs/test/notebooks/SccOutputAnalyzerTest.java
index d6e39e4..d7d0667 100644
--- a/Programs/test/notebooks/SccOutputAnalyzerTest.java
+++ b/Programs/test/notebooks/SccOutputAnalyzerTest.java
@@ -7,6 +7,7 @@
 
 public class SccOutputAnalyzerTest {
 	private SccOutputAnalyzer analyzer;
+	private final static String hashPattern = "[0-9,a-f]+";
 	private final static String notebookNamePattern = "nb_[0-9]+\\.ipynb";
 	
 	@Before
@@ -241,7 +242,7 @@ public void testLocComputation_odd() throws IOException {
 		
 		String[] expectedLines = {
 				hash2filesHeader(),
-				"[0-9,a-f]+, 13, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+"
+				hashPattern + ", 13, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+"
 		};
 
 		analyzer.clones(statsFile, reproMap, pairFile);
@@ -263,7 +264,7 @@ public void testLocComputation_even() throws IOException {
 		
 		String[] expectedLines = {
 				hash2filesHeader(),
-				"[0-9,a-f]+, 16, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+, nb_[0-9]\\.ipynb, [0-9]"
+				hashPattern + ", 16, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+"
 		};
 
 		analyzer.clones(statsFile, reproMap, pairFile);
@@ -271,6 +272,28 @@ public void testLocComputation_even() throws IOException {
 		TestUtils.deleteCloneCsvs();
 	}
 	
+	/**
+	 * Verify that a clone group is only considered once after the optimization of getCloneLists:
+	 * the six pairs in clone_pairs_clopt connect the same four snippets, so one group is expected.
+	 * @throws IOException if thrown by analyzer.clones or by the TestUtils helpers
+	 */
+	@Test
+	public void testCloneListsOptimization() throws IOException {
+		String dataDir = "test/data/scc";
+		String statsFile = dataDir + "/file_stats_clopt";
+		String pairFile = dataDir + "/clone_pairs_clopt";
+		String reproFile = "test/data/hash/repros.csv";
+		// Expected output: the header plus exactly one line listing four notebook/index pairs.
+		String[] expectedLines = {
+			hash2filesHeader(),
+			hashPattern + ", 10, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+"
+		};
+		
+		analyzer.clones(statsFile, reproFile, pairFile);
+		TestUtils.checkCsv_matches("hash2filesA", expectedLines);
+		TestUtils.deleteCloneCsvs();
+	}
+	
 	/**
 	 * Verify that an AssertionError is thrown when the clone pairs file is on
 	 * the wrong format.

From fa18651efef7f8b1f7257520c037b9a77b44d25d Mon Sep 17 00:00:00 2001
From: Tobias Wrigstad <tobias.wrigstad@it.uu.se>
Date: Mon, 25 May 2020 00:48:38 +0200
Subject: [PATCH 3/3] Still in dire need of refactoring

---
 Programs/build.xml                            |   2 +-
 Programs/runner.sh                            |   1 +
 Programs/src/notebooks/Analyzer.java          |  56 +-
 Programs/src/notebooks/CloneFileWriter.java   |  74 +-
 Programs/src/notebooks/NotebookFile.java      | 137 +++
 Programs/src/notebooks/Repository.java        |  74 ++
 Programs/src/notebooks/SccOutputAnalyzer.java | 991 +++++++++++-------
 Programs/src/notebooks/SccSnippetId.java      | 107 +-
 Programs/src/notebooks/ThreadExecutor.java    |   6 +-
 Programs/src/notebooks/Utils.java             |   1 +
 10 files changed, 1028 insertions(+), 421 deletions(-)
 create mode 100644 Programs/runner.sh
 create mode 100644 Programs/src/notebooks/NotebookFile.java
 create mode 100644 Programs/src/notebooks/Repository.java

diff --git a/Programs/build.xml b/Programs/build.xml
index 7529504..e8bf72b 100644
--- a/Programs/build.xml
+++ b/Programs/build.xml
@@ -27,7 +27,7 @@
 
 	<target name="test" depends="buildTest">
 		<mkdir dir="reports"/>
-		<junit fork="true" printsummary="yes" haltonfailure="yes">
+		<junit fork="true" printsummary="on" showoutput="yes" haltonfailure="no">
 			<classpath>
 				<pathelement location="${bin}"/>
 				<pathelement location="${junit.path}"/>
diff --git a/Programs/runner.sh b/Programs/runner.sh
new file mode 100644
index 0000000..f3b6ccd
--- /dev/null
+++ b/Programs/runner.sh
@@ -0,0 +1 @@
+java -ea -XX:+UseParallelGC -cp bin:external/json-simple-1.1.1.jar -Xms6G -Xmx20G notebooks.SccOutputAnalyzer --repro_file=/home/maka4186/notebook_disk/notebook-number_repo.csv --stats_file=/home/maka4186/notebook_disk/SourcererCC_output/files.stats --pair_file=/home/maka4186/notebook_disk/SourcererCC_output/clone.pairs.only.numbers --output_dir=OutputSCC2
diff --git a/Programs/src/notebooks/Analyzer.java b/Programs/src/notebooks/Analyzer.java
index 482c5bd..d8f7f42 100644
--- a/Programs/src/notebooks/Analyzer.java
+++ b/Programs/src/notebooks/Analyzer.java
@@ -4,7 +4,7 @@
 import java.io.FileNotFoundException;
 import java.util.HashMap;
 import java.util.Map;
-import java.util.Scanner;
+import java.io.*;
 
 public class Analyzer {
 	protected String outputDir = ".";
@@ -14,26 +14,38 @@ public class Analyzer {
 	 * @param fileName Name of file with mapping from notebook number to repro
 	 * @return The map from notebook name to repro
 	 */
-	protected static Map<String, String> createReproMap(String fileName)
-			throws FileNotFoundException {
-		Map<String, String> result = new HashMap<String, String>();
-		Scanner scanner = new Scanner(new File(fileName));
-		while (scanner.hasNextLine()) {
-			String line = scanner.nextLine();
-			String[] subStrings = line.split(",");
-			try {
-				int notebookNumber = Integer.parseInt(subStrings[0]);
-				String notebookName = "nb_" + notebookNumber + ".ipynb";
-				String reproName = subStrings[1];
-				result.put(notebookName, reproName);
-			} catch (NumberFormatException e) {
-				System.err.println("Notebook numbers in repro file must be integers! Notebook with \"number\" '"
-						+ subStrings[0] + "' is excluded from mapping!");
-			}
-		}
-		scanner.close();
-		return result;
-	}
+    protected static Map<String, String> createReproMap(String fileName)
+        throws FileNotFoundException {
+        Map<String, String> result = new HashMap<String, String>();
+        // try-with-resources closes the reader on every path, also when reading
+        // fails half-way through the file.
+        try (BufferedReader input = new BufferedReader(new FileReader(new File(fileName)))) {
+            String line;
+            while ((line = input.readLine()) != null) {
+                // Column 0 is parsed as the notebook number, column 1 is used as the repro name.
+                String[] subStrings = line.split(",");
+                try {
+                    int notebookNumber = Integer.parseInt(subStrings[0]);
+                    String notebookName = "nb_" + notebookNumber + ".ipynb";
+                    String reproName = subStrings[1];
+                    result.put(notebookName, reproName);
+                } catch (NumberFormatException e) {
+                    System.err.println("Notebook numbers in repro file must be integers! Notebook with \"number\" '"
+                                       + subStrings[0] + "' is excluded from mapping!");
+                }
+            }
+        } catch (FileNotFoundException e) {
+            // FileNotFoundException is a subclass of IOException. Rethrow it, so that
+            // a missing repro file is reported to the caller as the signature
+            // promises instead of silently producing an empty map.
+            throw e;
+        } catch (IOException e) {
+            // Best effort on read errors: report the problem and return the
+            // entries read so far.
+            e.printStackTrace(System.err);
+        }
+        return result;
+    }
 	
 	/**
 	 * Get the part of arg located after the (first) '=' sign. If the '=' is
@@ -48,4 +60,4 @@ protected String getValueFromArgument(String arg) {
 			return arg.substring(eqIndex + 1);
 		}
 	}
-}
\ No newline at end of file
+}
diff --git a/Programs/src/notebooks/CloneFileWriter.java b/Programs/src/notebooks/CloneFileWriter.java
index 2bbb263..ddc65f2 100644
--- a/Programs/src/notebooks/CloneFileWriter.java
+++ b/Programs/src/notebooks/CloneFileWriter.java
@@ -14,6 +14,7 @@
 import java.util.concurrent.Future;
 
 public class CloneFileWriter {
+    private LocalDateTime startTime = LocalDateTime.now();
 	private String outputDir;
 	
 	public CloneFileWriter(String outputDir) {
@@ -50,7 +51,7 @@ public void write(Map<Notebook, SnippetCode[]> file2hashes,
 	}
 	
 	private void printFile2hashes(Map<Notebook, SnippetCode[]> file2hashes) throws IOException {
-		Writer writer = new FileWriter(outputDir + "/file2hashesA" + LocalDateTime.now() + ".csv");
+		Writer writer = new FileWriter(outputDir + "/file2hashesA" + startTime + ".csv");
 		writer.write(file2hashesHeader());
 		for (Notebook notebook: file2hashes.keySet()) {
 			writer.write(notebook.getName());
@@ -64,7 +65,7 @@ private void printFile2hashes(Map<Notebook, SnippetCode[]> file2hashes) throws I
 	}
 	
 	private void printHash2files(Map<SnippetCode, List<Snippet>> hash2files) throws IOException {
-		Writer writer = new FileWriter(outputDir + "/hash2filesA" + LocalDateTime.now() + ".csv");
+		Writer writer = new FileWriter(outputDir + "/hash2filesA" + startTime + ".csv");
 		writer.write(hash2filesHeader());
 		for (SnippetCode code: hash2files.keySet()) {
 			writer.write(code.getHash() + ", " + code.getLOC());
@@ -78,7 +79,7 @@ private void printHash2files(Map<SnippetCode, List<Snippet>> hash2files) throws
 	
 	private void printCloneFrequencies(Map<Notebook, SnippetCode[]> file2hashes,
 			Map<SnippetCode, List<Snippet>> hash2files) throws IOException {
-		Writer writer = new FileWriter(outputDir + "/cloneFrequency" + LocalDateTime.now() + ".csv");
+		Writer writer = new FileWriter(outputDir + "/cloneFrequency" + startTime + ".csv");
 		writer.write(cloneFrequencyHeader());
 		for (Notebook notebook: file2hashes.keySet()) {
 			int numClones = 0, numUnique = 0, numClonesNE = 0;
@@ -146,32 +147,47 @@ private void printCloneFrequencies(Map<Notebook, SnippetCode[]> file2hashes,
 	 * @param hash2files Mapping from snippets to position in notebooks
 	 * @param NUM_NOTEBOOKS Maximum number of notebooks to print connection information for
 	 */
-	private void printConnectionsFile(Map<Notebook, SnippetCode[]> file2hashes,
-			Map<SnippetCode, List<Snippet>> hash2files, final int NUM_CONNECTIONS) throws IOException {
-		Writer writer = new FileWriter(outputDir + "/connections" + LocalDateTime.now() + ".csv");
-		writer.write(connectionsHeader());
-		List<Notebook> notebooks = new ArrayList<Notebook>(file2hashes.keySet());
-		Collections.shuffle(notebooks);
-		int connectionsToPrint = Math.min(NUM_CONNECTIONS, file2hashes.size());
-		List<Callable<String>> tasks = new ArrayList<Callable<String>>(connectionsToPrint);
-		for (int i=0; i<connectionsToPrint; i++) {
-			boolean heartBeat = 0 == i%10000;
-			tasks.add(new ConnectionsLineBuilder(notebooks.get(i), file2hashes, hash2files, heartBeat));
-		}
-		List<Future<String>> result = ThreadExecutor.getInstance().invokeAll(tasks);
-		for (int i=0; i<connectionsToPrint; i++) {
-			try {
-				writer.write(result.get(i).get());
-			} catch (InterruptedException e) {
-				System.err.println("Printing of connections for notebook " + notebooks.get(i).getName()
-						+ " was interrupted! " + e.getMessage());
-			} catch (ExecutionException e) {
-				System.err.println("Printing connections for notebook "
-						+ notebooks.get(i).getName() + " failed!" + e.toString());
-			}
-		}
-		writer.close();
-	}
+    private void printConnectionsFile(final Map<Notebook, SnippetCode[]> file2hashes,
+                                      final Map<SnippetCode, List<Snippet>> hash2files,
+                                      final int NUM_CONNECTIONS) throws IOException {
+        final List<Notebook> notebooks = new ArrayList<Notebook>(file2hashes.keySet());
+        // NOTE(review): the notebooks are not shuffled, so when NUM_CONNECTIONS is smaller than
+        // the number of notebooks the first ones in key set order are used -- confirm intended.
+        final int connectionsToPrint = Math.min(NUM_CONNECTIONS, notebooks.size());
+
+        // Worker w builds the lines of notebooks w, w + NUM_WORKERS, w + 2*NUM_WORKERS, ...
+        final int NUM_WORKERS = 8;
+        List<Callable<List<String>>> tasks = new ArrayList<Callable<List<String>>>(NUM_WORKERS);
+        for (int i = 0; i < NUM_WORKERS; i++) {
+            final boolean heartBeat = true;
+            final int start = i;
+            tasks.add(() -> {
+                    List<String> lines = new ArrayList<String>();
+                    for (int j = start; j < connectionsToPrint; j += NUM_WORKERS) {
+                        lines.add(new ConnectionsLineBuilder(notebooks.get(j), file2hashes, hash2files, heartBeat).call());
+                    }
+                    return lines;
+                });
+        }
+        List<Future<List<String>>> result = ThreadExecutor.getInstance().invokeAll(tasks);
+
+        // try-with-resources makes sure that the file is flushed and closed on every path.
+        try (Writer writer = new FileWriter(outputDir + "/connections" + startTime + ".csv")) {
+            writer.write(connectionsHeader());
+            for (int w = 0; w < NUM_WORKERS; w++) {
+                try {
+                    for (String line : result.get(w).get()) {
+                        writer.write(line);
+                    }
+                } catch (InterruptedException e) {
+                    Thread.currentThread().interrupt();
+                    System.err.println("Printing of connections was interrupted! " + e.getMessage());
+                } catch (ExecutionException e) {
+                    System.err.println("Printing connections failed! " + e.toString());
+                }
+            }
+        }
+    }
 	
 	/**
 	 * Look in clones to decide whether snippet is a clone or a unique snippet
diff --git a/Programs/src/notebooks/NotebookFile.java b/Programs/src/notebooks/NotebookFile.java
new file mode 100644
index 0000000..4f48b8a
--- /dev/null
+++ b/Programs/src/notebooks/NotebookFile.java
@@ -0,0 +1,137 @@
+package notebooks;
+
+import java.util.function.Consumer;
+/** Connection counters for one notebook; instances are kept in a static table indexed by an integer id. */
+class NotebookFile {
+    private static NotebookFile[] directory = new NotebookFile[2800000]; /// FIXME: hard-coded capacity, so valid ids are 0..2799999
+    /** Stores a new NotebookFile for notebookNumber at index id unless that slot is taken; an id outside the table throws ArrayIndexOutOfBoundsException. */
+    public static void register(int id, int notebookNumber) {
+        if (NotebookFile.directory[id] != null) return;
+
+        final NotebookFile nb = new NotebookFile(notebookNumber); 
+        NotebookFile.directory[id] = nb;
+    }
+    /** Calls f once for every registered NotebookFile, in increasing index order. */
+    public static void forAll(Consumer<NotebookFile> f) {
+        for (NotebookFile nb : directory) {
+            if (nb != null) {
+                f.accept(nb);
+            }
+        }
+    }
+    /** Returns the entry at index id; the range and presence checks are asserts, so they only run with assertions enabled. */
+    public static NotebookFile getById(int id) {
+        assert (0 <= id && id < NotebookFile.directory.length) : "Illegal repo id " + id;
+        assert (NotebookFile.directory[id] != null) : "Tried to lookup non-existing NotebookFile (id=" + id + ")";
+
+        return NotebookFile.directory[id];
+    }
+    /** The value given to the constructor; register passes the notebook number here, not the table index. */
+    private final int id;
+    /// Connections between snippets where inter/intra denotes different/same notebooks
+    private int intraConnections = 0;
+    private int intraEmptyConnections = 0;
+    private int interConnections = 0;
+    private int interEmptyConnections = 0;
+    /// Connections between snippets where inter/intra denotes different/same repo
+    private int intraRepoConnections = 0;
+    private int intraRepoEmptyConnections = 0;
+    private int interRepoConnections = 0;
+    private int interRepoEmptyConnections = 0;
+    /** Looks up this notebook's number with Repository.getByNotebookNumber and returns the result. */
+    public Repository getRepo() {
+        return Repository.getByNotebookNumber(this.id);
+    }
+    /** Creates an instance with the given id and all counters at zero. */
+    public NotebookFile(int id) {
+        this.id = id;
+    }
+    /** Adds one to the intra-notebook counter selected by empty. */
+    public void addIntraConnections(boolean empty) {
+        addIntraConnections(1, empty);
+    }
+    /** Adds one to the inter-notebook counter selected by empty. */
+    public void addInterConnections(boolean empty) {
+        addInterConnections(1, empty);
+    }
+    /** Adds value to intraEmptyConnections if empty is true, otherwise to intraConnections. */
+    public void addIntraConnections(int value, boolean empty) {
+        if (empty) {
+            intraEmptyConnections += value;
+        } else {
+            intraConnections += value;
+        }
+    }
+    /** Adds value to interEmptyConnections if empty is true, otherwise to interConnections. */
+    public void addInterConnections(int value, boolean empty) {
+        if (empty) {
+            interEmptyConnections += value;
+        } else {
+            interConnections += value;
+        }
+    }
+
+    public int intraConnections() {
+        return intraConnections;
+    }
+
+    public int intraEmptyConnections() {
+        return intraEmptyConnections;
+    }
+
+    public int interConnections() {
+        return interConnections;
+    }
+
+    public int interEmptyConnections() {
+        return interEmptyConnections;
+    }
+    /** Adds one to the intra-repo counter selected by empty. */
+    public void addIntraRepoConnections(boolean empty) {
+        addIntraRepoConnections(1, empty);
+    }
+    /** Adds one to the inter-repo counter selected by empty. */
+    public void addInterRepoConnections(boolean empty) {
+        addInterRepoConnections(1, empty);
+    }
+    /** Adds value to intraRepoEmptyConnections if empty is true, otherwise to intraRepoConnections. */
+    public void addIntraRepoConnections(int value, boolean empty) {
+        if (empty) {
+            intraRepoEmptyConnections += value;
+        } else {
+            intraRepoConnections += value;
+        }
+    }
+    /** Adds value to interRepoEmptyConnections if empty is true, otherwise to interRepoConnections. */
+    public void addInterRepoConnections(int value, boolean empty) {
+        if (empty) {
+            interRepoEmptyConnections += value;
+        } else {
+            interRepoConnections += value;
+        }
+    }
+
+    public int intraRepoConnections() {
+        return intraRepoConnections;
+    }
+
+    public int intraRepoEmptyConnections() {
+        return intraRepoEmptyConnections;
+    }
+
+    public int interRepoConnections() {
+        return interRepoConnections;
+    }
+
+    public int interRepoEmptyConnections() {
+        return interRepoEmptyConnections;
+    }
+    /** Returns the notebook file name built from the id: "nb_" + id + ".ipynb". */
+    public String fileName() {
+        return new StringBuilder()
+            .append("nb_")
+            .append(this.id)
+            .append(".ipynb")
+            .toString();
+    }
+}
diff --git a/Programs/src/notebooks/Repository.java b/Programs/src/notebooks/Repository.java
new file mode 100644
index 0000000..6eb4e3c
--- /dev/null
+++ b/Programs/src/notebooks/Repository.java
@@ -0,0 +1,74 @@
+package notebooks;
+
+import java.util.HashMap;
+/** A repository that one or more notebooks are registered to, with counters for its connections. */
+public class Repository {
+    private static HashMap<String, Repository> urlRepo = new HashMap<String, Repository>();
+    private static HashMap<Integer, Repository> notebookRepo = new HashMap<Integer, Repository>();
+    /** Maps notebook nb to the repository registered for url, creating that repository on first use. */
+    public static void register(int nb, String url) {
+        Repository r = getByURL(url);
+        // Keyed by the url itself: distinct urls may share a hashCode() and must not be merged.
+        if (r == null) {
+            r = new Repository();
+            Repository.urlRepo.put(url, r);
+        }
+
+        Repository.notebookRepo.put(nb, r);
+    }
+    /** Returns the repository registered for notebook nb, or null if there is none. */
+    public static Repository getByNotebookNumber(int nb) {
+        return Repository.notebookRepo.get(nb);
+    }
+    /** Returns the repository registered for url, or null if there is none. */
+    public static Repository getByURL(String url) {
+        return Repository.urlRepo.get(url);
+    }
+    /** Instances are only created by register. */
+    private Repository() {}
+    /// Connection counters; the "empty" variants are selected by the empty flag of the add methods
+    private int intraConnections = 0;
+    private int intraEmptyConnections = 0;
+    private int interConnections = 0;
+    private int interEmptyConnections = 0;
+
+    public int intraConnections() {
+        return intraConnections;
+    }
+
+    public int intraEmptyConnections() {
+        return intraEmptyConnections;
+    }
+
+    public int interConnections() {
+        return interConnections;
+    }
+
+    public int interEmptyConnections() {
+        return interEmptyConnections;
+    }
+    /** Adds one to the intra counter selected by empty. */
+    public void addIntraConnections(boolean empty) {
+        addIntraConnections(1, empty);
+    }
+    /** Adds one to the inter counter selected by empty. */
+    public void addInterConnections(boolean empty) {
+        addInterConnections(1, empty);
+    }
+    /** Adds value to intraEmptyConnections if empty is true, otherwise to intraConnections. */
+    public void addIntraConnections(int value, boolean empty) {
+        if (empty) {
+            intraEmptyConnections += value;
+        } else {
+            intraConnections += value;
+        }
+    }
+    /** Adds value to interEmptyConnections if empty is true, otherwise to interConnections. */
+    public void addInterConnections(int value, boolean empty) {
+        if (empty) {
+            interEmptyConnections += value;
+        } else {
+            interConnections += value;
+        }
+    }
+}
diff --git a/Programs/src/notebooks/SccOutputAnalyzer.java b/Programs/src/notebooks/SccOutputAnalyzer.java
index 0f479fd..28913d1 100644
--- a/Programs/src/notebooks/SccOutputAnalyzer.java
+++ b/Programs/src/notebooks/SccOutputAnalyzer.java
@@ -10,391 +10,668 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Scanner;
+import java.io.*;
 import java.util.Set;
+import java.util.Locale;
 
 public class SccOutputAnalyzer extends Analyzer {
-	// Information about each snippet
-	Map<SccSnippetId, Integer> notebookNumbers;
-	Map<SccSnippetId, Integer> snippetIndices;
-	Map<SccSnippetId, Integer> linesOfCode;
-	// Information about each notebook
-	private Map<String, String> repros = null;
-	Map<String, Integer> snippetsPerNotebook = null;
-	
-	/**
-	 * Perform the clone analysis based on SourcererCC output files. Write
-	 * file2hashesA<current-date-time>.csv, hash2filesA<current-date-time>.csv,
-	 * cloneFrequencies<current-date-time>.csv and
-	 * connections<current-date-time>.csv accordingly.
-	 * This methods initializes snippet and repro information, so you shouldn't
-	 * do it explicitly before the call to this method.
-	 * Note that the ''hashes'' written by this method are not the MD5 hashes
-	 * of the snippets, but just the value of a counter. However, all instances
-	 * of the ''hash'' of a snippet are the same.
-	 * @param statsFile Path to file stats file produced by the SourcererCC tokenizer
-	 * @param reproFile Path to file with mapping from notebook number to repro
-	 * @param pairFile: Path to output file with clone pairs from the SourcererCC clone detection
-	 * @return A map from snippets to files
-	 * @throws IOException
-	 */
-	public Map<SnippetCode, List<Snippet>> clones(String statsFile, String reproFile, String pairFile) throws IOException {
-		initializeSnippetInfo(statsFile);
-		initializeReproMap(reproFile);
-		return clones(pairFile);
-	}
+    private static final long startTimeStamp = System.currentTimeMillis();
+    /**
+     * Print a message on standard error, prefixed with the number of whole
+     * seconds that have passed since startTimeStamp was initialized.
+     * @param msg Message to print
+     */
+    public static void printTimeStampedMsg(String msg) {
+        final long elapsedSeconds = (System.currentTimeMillis() - startTimeStamp) / 1000;
+        System.err.println(">>> (" + elapsedSeconds + " sec) " + msg);
+    }
+
+    // Information about each snippet
+    Map<SccSnippetId, Integer> notebookNumbers;
+    Map<SccSnippetId, Integer> snippetIndices;
+    Map<SccSnippetId, Integer> linesOfCode;
+    // Information about each notebook
+    private Map<String, String> repros = null;
+    Map<String, Integer> snippetsPerNotebook = null;
 	
-	/**
-	 * Perform the clone analysis based on SourcererCC output files. Write
-	 * file2hashesA<current-date-time>.csv, hash2filesA<current-date-time>.csv,
-	 * cloneFrequencies<current-date-time>.csv and
-	 * connections<current-date-time>.csv accordingly.
-	 * Note that you have to initialize the snippet and repro information, by
-	 * calling initializeSnippetInfo and initializeReproMap respectively before
-	 * calling this method!
-	 * Note that the ''hashes'' written by this method are not the MD5 hashes
-	 * of the snippets, but just the value of a counter. However, all instances
-	 * of the ''hash'' of a snippet are the same.
-	 * @param pairFile: Path to output file with clone pairs from the SourcererCC clone detection
-	 * @return A map from snippets to files
-	 * @throws IOException
-	 */
-	public Map<SnippetCode, List<Snippet>> clones(String pairFile) throws IOException {
-		System.out.println("Analyzing clones based on SourcererCC output files!");
-		System.out.println("NOTE THAT NOTEBOOKS WITHOUT SNIPPETS ARE NOT INCLUDED");
-		System.out.println("since they are not included in the SourcererCC data!");
-		Map<SnippetCode, List<Snippet>> snippet2file = getClones(pairFile);
-		Map<Notebook, SnippetCode[]> file2snippet = getSnippets(snippet2file);
-		new CloneFileWriter(outputDir).write(file2snippet, snippet2file);
-		return snippet2file;
-	}
+    /**
+     * Perform the clone analysis based on SourcererCC output files. Write
+     * file2hashesA<current-date-time>.csv, hash2filesA<current-date-time>.csv,
+     * cloneFrequencies<current-date-time>.csv and
+     * connections<current-date-time>.csv accordingly.
+     * This methods initializes snippet and repro information, so you shouldn't
+     * do it explicitly before the call to this method.
+     * Note that the ''hashes'' written by this method are not the MD5 hashes
+     * of the snippets, but just the value of a counter. However, all instances
+     * of the ''hash'' of a snippet are the same.
+     * @param statsFile Path to file stats file produced by the SourcererCC tokenizer
+     * @param reproFile Path to file with mapping from notebook number to repro
+     * @param pairFile: Path to output file with clone pairs from the SourcererCC clone detection
+     * @return A map from snippets to files
+     * @throws IOException
+     */
+    public Map<SnippetCode, List<Snippet>> clones(String statsFile, String reproFile, String pairFile) throws IOException {
+        registerRepos(reproFile);
+        registerSnippets(statsFile);
+        // getCloneMap, addSnippet and getSnippets read notebookNumbers,
+        // snippetIndices, linesOfCode, snippetsPerNotebook and repros. Those
+        // fields are only assigned by the two initialize methods, so skipping
+        // them ends in a NullPointerException further down.
+        // NOTE(review): called after registerSnippets on the assumption that
+        // SccSnippetId.getByCommaSeparatedPair looks up registered ids - confirm.
+        initializeSnippetInfo(statsFile);
+        initializeReproMap(reproFile);
+        return clones(pairFile);
+    }
 	
-	/**
-	 * Initialize repro information for each notebook.
-	 * @param fileName Path to file with mapping from notebook number to repro
-	 */
-	public void initializeReproMap(String fileName) throws FileNotFoundException {
-		repros = createReproMap(fileName);
-	}
+    /**
+     * Run the clone analysis on the clone pairs reported by SourcererCC and
+     * write file2hashesA<current-date-time>.csv, hash2filesA<current-date-time>.csv,
+     * cloneFrequencies<current-date-time>.csv and
+     * connections<current-date-time>.csv accordingly.
+     * The snippet and repro information must already be initialized, that is,
+     * initializeSnippetInfo and initializeReproMap must have been called
+     * before this method!
+     * The ''hashes'' written are not MD5 hashes of the snippets, only the
+     * value of a counter. All instances of the ''hash'' of a snippet are the
+     * same, though.
+     * @param pairFile: Path to output file with clone pairs from the SourcererCC clone detection
+     * @return A map from snippets to files
+     * @throws IOException
+     */
+    public Map<SnippetCode, List<Snippet>> clones(String pairFile) throws IOException {
+        final String[] banner = {
+            "Analyzing clones based on SourcererCC output files!",
+            "NOTE THAT NOTEBOOKS WITHOUT SNIPPETS ARE NOT INCLUDED",
+            "since they are not included in the SourcererCC data!"
+        };
+        for (String bannerLine : banner) {
+            System.out.println(bannerLine);
+        }
+        final Map<SnippetCode, List<Snippet>> hash2files = getClones(pairFile);
+        final Map<Notebook, SnippetCode[]> file2hashes = getSnippets(hash2files);
+        final CloneFileWriter writer = new CloneFileWriter(outputDir);
+        writer.write(file2hashes, hash2files);
+        return hash2files;
+    }
 	
-	/**
-	 * Initialize the maps containing information about each snippet
-	 * @param statsFile Path to file stats file produced by the SourcererCC tokenizer
-	 * @throws FileNotFoundException If the stats file doesn't exist
-	 */
-	public void initializeSnippetInfo(String statsFile) throws FileNotFoundException {
-		Scanner statsScanner = new Scanner(new File(statsFile));
-		notebookNumbers = new HashMap<SccSnippetId, Integer>();
-		snippetIndices = new HashMap<SccSnippetId, Integer>();
-		linesOfCode = new HashMap<SccSnippetId, Integer>();
-		snippetsPerNotebook = new HashMap<String, Integer>();
-		while(statsScanner.hasNextLine()) {
-			String line = statsScanner.nextLine();
-			String[] columns = line.split(",");
-			int id1 = Integer.parseInt(columns[0]);
-			int id2 = Integer.parseInt(columns[1]);
-			SccSnippetId id = new SccSnippetId(id1, id2);
-			String path = columns[2];
-			// Remove directories from filename
-			String snippetFileName = path.substring(path.lastIndexOf('/') + 1);
-			// Remove suffix
-			snippetFileName = snippetFileName.substring(0, snippetFileName.lastIndexOf('.'));
-			String[] snippetSubStrings = snippetFileName.split("_");
-			int notebookNumber = Integer.parseInt(snippetSubStrings[1]);
-			String notebookName = getNotebookNameFromNumber(notebookNumber);
-			addOrIncrease(snippetsPerNotebook, notebookName);
-			notebookNumbers.put(id, notebookNumber);
-			snippetIndices.put(id, Integer.parseInt(snippetSubStrings[2]));
-			/* Here we use the number of lines of source code (comments
-			   excluded), which is inconsistent with the clone analysis of the 
-			   notebook files, but so is the clone detection -SourcererCC
-			   doesn't consider comments in clone analysis. */
-			int loc = Integer.parseInt(columns[8]);
-			linesOfCode.put(id, loc);
-		}
-		statsScanner.close();
-	}
+    /**
+     * Set up the repro information of the notebooks by storing the map
+     * produced by createReproMap.
+     * @param fileName Path to file with mapping from notebook number to repro
+     */
+    public void initializeReproMap(String fileName) throws FileNotFoundException {
+        final Map<String, String> reproMap = createReproMap(fileName);
+        this.repros = reproMap;
+    }
+
+    /**
+     * Register the repository of every notebook listed in the repro file.
+     * Each line is parsed as <notebook number>,<url>, split at the first comma.
+     * Read errors other than a missing file are printed and otherwise ignored.
+     * @param fileName Path to file with mapping from notebook number to repro
+     * @throws FileNotFoundException If the repro file doesn't exist
+     */
+    public void registerRepos(String fileName) throws FileNotFoundException {
+        try (BufferedReader f = new BufferedReader(new FileReader(new File(fileName)))) {
+            for (String line = f.readLine(); line != null; line = f.readLine()) {
+                if (line.isEmpty()) {
+                    continue; // tolerate blank lines instead of failing in substring
+                }
+                final int separator = line.indexOf(',');
+                Repository.register(Integer.parseInt(line.substring(0, separator)),    // Notebook ID
+                                    line.substring(separator + 1));                    // URL
+            }
+        } catch (FileNotFoundException e) {
+            // FileNotFoundException is an IOException; rethrow it so that it
+            // reaches the caller as declared instead of being swallowed below
+            throw e;
+        } catch (IOException e) {
+            e.printStackTrace(System.err);
+        }
+    }
 	
-	/**
-	 * If map contains a value for key, increase it with 1. Else add an entry
-	 * with for key with the value 1.
-	 * @param map Map to modify as stated above
-	 * @param key Key for the entry that will be changed/added
-	 */
-	private void addOrIncrease(Map<String, Integer> map, String key) {
-		if (map.containsKey(key)) {
-			map.put(key, map.get(key) + 1);
-		} else {
-			map.put(key, 1);
-		}
-	}
-
-	/**
-	 * Create a mapping from snippets to notebooks (hash2files) using output
-	 * files from SourcererCC.
-	 */
-	private Map<SnippetCode, List<Snippet>> getClones(String pairFile) throws IOException {
-		// List<List<SccSnippetId>> clones = getCloneLists(pairFile);
-		// List<List<SccSnippetId>> clones2 = getCloneListsAlt(pairFile);
-
-    // HashSet<HashSet<SccSnippetId>> a = new HashSet<HashSet<SccSnippetId>>();
-    // HashSet<HashSet<SccSnippetId>> b = new HashSet<HashSet<SccSnippetId>>();
+    /**
+     * Pass every line of the stats file to SccSnippetId.register.
+     * Read errors other than a missing file are printed and otherwise ignored.
+     * @param statsFile Path to file stats file produced by the SourcererCC tokenizer
+     * @throws FileNotFoundException If the stats file doesn't exist
+     */
+    public void registerSnippets(String statsFile) throws FileNotFoundException {
+        try (BufferedReader f = new BufferedReader(new FileReader(new File(statsFile)))) {
+            for (String line = f.readLine(); line != null; line = f.readLine()) {
+                SccSnippetId.register(line);
+            }
+        } catch (FileNotFoundException e) {
+            // FileNotFoundException is an IOException; rethrow it so that it
+            // reaches the caller as declared instead of being swallowed below
+            throw e;
+        } catch (IOException e) {
+            e.printStackTrace(System.err);
+        }
+    }
+
+    /**
+     * Initialize the maps containing information about each snippet
+     * (notebookNumbers, snippetIndices, linesOfCode and snippetsPerNotebook).
+     * Read errors other than a missing file are printed and end the reading.
+     * @param statsFile Path to file stats file produced by the SourcererCC tokenizer
+     * @throws FileNotFoundException If the stats file doesn't exist
+     */
+    public void initializeSnippetInfo(String statsFile) throws FileNotFoundException {
+        // Opened outside the try block so that a missing file reaches the caller
+        final FileReader statsReader = new FileReader(new File(statsFile));
+        notebookNumbers = new HashMap<SccSnippetId, Integer>();
+        snippetIndices = new HashMap<SccSnippetId, Integer>();
+        linesOfCode = new HashMap<SccSnippetId, Integer>();
+        snippetsPerNotebook = new HashMap<String, Integer>();
+        // try-with-resources closes the reader also when a line fails to parse
+        try (BufferedReader input = new BufferedReader(statsReader)) {
+            for (String line = input.readLine(); line != null; line = input.readLine()) {
+                String[] columns = line.split(",");
+                // The id is looked up from the first two columns, comma included
+                final int endOfIdPair = line.indexOf(',', line.indexOf(',') + 1);
+                final SccSnippetId id = SccSnippetId
+                    .getByCommaSeparatedPair(line.substring(0, endOfIdPair));
+
+                String path = columns[2];
+                // Remove directories from filename
+                String snippetFileName = path.substring(path.lastIndexOf('/') + 1);
+                // Remove suffix
+                snippetFileName = snippetFileName.substring(0, snippetFileName.lastIndexOf('.'));
+                // Second and third '_'-separated parts: notebook number and snippet index
+                String[] snippetSubStrings = snippetFileName.split("_");
+                int notebookNumber = Integer.parseInt(snippetSubStrings[1]);
+                addOrIncrease(snippetsPerNotebook, "nb_" + snippetSubStrings[1] + ".ipynb");
+                notebookNumbers.put(id, notebookNumber);
+                snippetIndices.put(id, Integer.parseInt(snippetSubStrings[2]));
+                /* Here we use the number of lines of source code (comments
+                   excluded), which is inconsistent with the clone analysis of the
+                   notebook files, but so is the clone detection -SourcererCC
+                   doesn't consider comments in clone analysis. */
+                int loc = Integer.parseInt(columns[8]);
+                linesOfCode.put(id, loc);
+            }
+        } catch (IOException e) {
+            e.printStackTrace(System.err);
+        }
+    }
     
-    // for (List<SccSnippetId> il : clones) {
-    //     a.add(new HashSet<SccSnippetId>(il));
-    // }
-    // for (List<SccSnippetId> il : clones2) {
-    //     b.add(new HashSet<SccSnippetId>(il));
-    // }
-
-    // System.err.println("ALTERNATIVE " + a.equals(b));
     
-		List<List<SccSnippetId>> clones = getCloneListsAlt(pairFile);
-		return getCloneMap(clones);
-	}
+    /**
+     * Count one more occurrence of key: an existing value is increased by 1,
+     * a missing key is entered with the value 1.
+     * @param map Map to modify as stated above
+     * @param key Key for the entry that will be changed/added
+     */
+    private void addOrIncrease(Map<String, Integer> map, String key) {
+        final int updated = map.containsKey(key) ? map.get(key) + 1 : 1;
+        map.put(key, updated);
+    }
+
+    /**
+     * Build the mapping from snippets to notebooks (hash2files) from the
+     * clone pairs output by SourcererCC. The connection information for the
+     * clone lists is computed and printed on the way.
+     */
+    private Map<SnippetCode, List<Snippet>> getClones(String pairFile) throws IOException {
+        final List<List<SccSnippetId>> cloneLists = getCloneLists(pairFile);
+        constructConnectionGraphInfo(cloneLists);
+        final Map<SnippetCode, List<Snippet>> cloneMap = getCloneMap(cloneLists);
+        return cloneMap;
+    }
 	
-	private List<List<SccSnippetId>> getCloneLists(String pairFile) throws FileNotFoundException {
-		List<List<SccSnippetId>> clones = new ArrayList<List<SccSnippetId>>();
-		Scanner scanner = new Scanner(new File(pairFile));
-		int numRead = 0;
-    long t1 = System.currentTimeMillis();
-		while (scanner.hasNextLine()) {
-			String line = scanner.nextLine();
-			assert(line.matches("[0-9]+,[0-9]+,[0-9]+,[0-9]+"));
-			String[] numbers = line.split(",");
-			SccSnippetId id1 = new SccSnippetId(Integer.parseInt(numbers[0]), Integer.parseInt(numbers[1]));
-			SccSnippetId id2 = new SccSnippetId(Integer.parseInt(numbers[2]), Integer.parseInt(numbers[3]));
-			boolean bothStored = false;
-			Iterator<List<SccSnippetId>> it = clones.iterator();
-			while (!bothStored && it.hasNext()) {
-				List<SccSnippetId> existing = it.next();
-				boolean id1stored = existing.contains(id1);
-				boolean id2stored = existing.contains(id2); 
-				if(id1stored && id2stored) {
-					bothStored = true;
-				} else if (id1stored && !id2stored) {
-					existing.add(id2);
-					bothStored = true;
-				} else if(id2stored && !id1stored) {
-					existing.add(id1);
-					bothStored = true;
-				}
-			}
-			if (!bothStored) {
-				List<SccSnippetId> newCloneList = new ArrayList<SccSnippetId>();
-				newCloneList.add(id1);
-				newCloneList.add(id2);
-				clones.add(newCloneList);
-			}
-			numRead++;
-			if (0 == numRead%10000) {
-          long t2 = System.currentTimeMillis();
-          System.out.println(numRead + " clone pairs read in " + (t2 - t1) + " ms");
-          t1 = t2;
-			}
-		}
-		scanner.close();
-		return clones;
-	}
-
-    private List<List<SccSnippetId>> getCloneListsAlt(String pairFile) throws FileNotFoundException {
-        Scanner scanner = new Scanner(new File(pairFile));
-        int numRead = 0;
-
-        HashMap<SccSnippetId, HashSet<SccSnippetId>> clones = new HashMap<SccSnippetId, HashSet<SccSnippetId>>();
-
-        long t1 = System.currentTimeMillis();
-
-        while (scanner.hasNextLine()) {
-            String line = scanner.nextLine();
-            assert(line.matches("[0-9]+,[0-9]+,[0-9]+,[0-9]+"));
-            String[] numbers = line.split(",");
-            SccSnippetId id1 = new SccSnippetId(Integer.parseInt(numbers[0]), Integer.parseInt(numbers[1]));
-            SccSnippetId id2 = new SccSnippetId(Integer.parseInt(numbers[2]), Integer.parseInt(numbers[3]));
-
-            HashSet<SccSnippetId> id1Clones = clones.get(id1);
-            HashSet<SccSnippetId> id2Clones = clones.get(id2);
-
-            if (id1Clones == id2Clones) {
-                if (id1Clones != null) {
-                    /// We already had them marked as clones
-                } else {
-                    /// Create a new clone set with this data
-                    HashSet<SccSnippetId> newSet = new HashSet<SccSnippetId>();
-                    newSet.add(id1);
-                    newSet.add(id2);
-                    clones.put(id1, newSet);
-                    clones.put(id2, newSet);
+    /**
+     * Return the lists of snippets that are clones of each other. If the file
+     * getCloneLists.result.txt exists in the working directory, the lists are
+     * read from it (one list per line, entries separated by commas) and
+     * pairFile is not looked at. Otherwise they are computed from pairFile.
+     * NOTE(review): the cache file name does not depend on pairFile, so a
+     * cache written for another pair file is reused as is.
+     * @param pairFile Path to output file with clone pairs from the SourcererCC clone detection
+     * @return One list of snippet ids per group of clones
+     */
+    private List<List<SccSnippetId>> getCloneLists(String pairFile) throws FileNotFoundException {
+        File cachedResultsOfPreviousRun = new File("getCloneLists.result.txt");
+
+        if (cachedResultsOfPreviousRun.exists()) {
+            try (BufferedReader f = new BufferedReader(new FileReader(cachedResultsOfPreviousRun))) {
+                ArrayList<List<SccSnippetId>> result = new ArrayList<List<SccSnippetId>>();
+
+                for (String line = f.readLine(); line != null; line = f.readLine()) {
+                    ArrayList<SccSnippetId> innerResult = new ArrayList<SccSnippetId>();
+
+                    for (String entry : line.split(",")) {
+                        if (entry.length() > 0) {
+                            innerResult.add(SccSnippetId.getByPair(entry));
+                        }
+                    }
+
+                    result.add(innerResult);
                 }
-            } else {
-                /// Merge the sets as they are both clones, and point both to same set
-                if (id1Clones == null) {
-                    id2Clones.add(id1);
-                    clones.put(id1, id2Clones);
-                } else if (id2Clones == null) {
-                    id1Clones.add(id2);
-                    clones.put(id2, id1Clones);
-                } else {
-                    id1Clones.addAll(id2Clones);
-                    clones.put(id2, id1Clones);
+                
+                return result;
+            } catch (IOException e) {
+                // Say why the program stops instead of exiting silently
+                System.err.println("Could not read cached clone lists from "
+                                   + cachedResultsOfPreviousRun.getPath() + ": " + e.getMessage());
+                e.printStackTrace(System.err);
+                System.exit(-1);
+            }
+        } 
+
+        return computeCloneLists(pairFile);
+    }
+
+    /**
+     * Read the clone pairs in pairFile line by line, group them with
+     * CloneGroup, convert the groups to lists of snippet ids and write those
+     * lists to getCloneLists.result.txt (one list per line, every id followed
+     * by a comma).
+     * NOTE(review): the reader and the writer are closed on the success path
+     * only. An IOException (a missing pair file included, since the file is
+     * opened inside the try block) is printed and turned into a null return
+     * value.
+     * @param pairFile Path to output file with clone pairs from the SourcererCC clone detection
+     * @return The clone lists, or null if an IOException occurred
+     */
+    private List<List<SccSnippetId>> computeCloneLists(String pairFile) throws FileNotFoundException {
+        HashMap<Integer, CloneGroup> clones = new HashMap<Integer, CloneGroup>();
+
+        long numRead = 0;
+        try {
+            final BufferedReader file = new BufferedReader(new FileReader(new File(pairFile)));
+
+            for (String line = file.readLine(); line != null; line = file.readLine()) {
+                CloneGroup.addToCloneList(clones, line);
+                ++numRead;
+
+                // Compact after every 5,000,000 pairs (presumably to bound memory use - confirm)
+                if ((numRead % 5000000) == 0) {
+                    CloneGroup.compact(clones);
+                }
+                
+                // Progress message after every 1,000,000 pairs
+                if ((numRead % 1000000) == 0) {
+                    printTimeStampedMsg(numRead + " clone pairs read");
                 }
             }
 
-            numRead++;
-            if (0 == numRead%10000) {
-                long t2 = System.currentTimeMillis();
-                System.out.println(numRead + " clone pairs read in " + (t2 - t1) + " ms");
-                t1 = t2;
+            file.close();
+            printTimeStampedMsg("Done reading clone pairs (" + clones.size() + ") keys in clones");
+
+            printTimeStampedMsg("calling convertResult");
+            List<List<SccSnippetId>> result = CloneGroup.convertResult(clones, SccSnippetId.directory);
+            printTimeStampedMsg("returning from convertResult");
+            // Drop the local reference to the map; it is not used below
+            clones = null;
+
+            // Save the result in the file that getCloneLists checks for
+            FileWriter fw = new FileWriter(new File("getCloneLists.result.txt"));
+            for (List<SccSnippetId> list : result) {
+                for(SccSnippetId sid : list) {
+                    fw.write(sid.toString());
+                    fw.write(',');
+                }
+                fw.write('\n');
             }
+            fw.close();
+
+            return result;
+
+        } catch (IOException e) {
+            e.printStackTrace(System.err);
         }
 
-        List<List<SccSnippetId>> result = new ArrayList<List<SccSnippetId>>();
+        return null;
+    }
 
-        HashSet<HashSet<SccSnippetId>> alreadyIncluded = new HashSet<HashSet<SccSnippetId>>();
-        for(HashSet<SccSnippetId> list : clones.values()) {
-            if (alreadyIncluded.contains(list) == false) {
-                result.add(new ArrayList<SccSnippetId>(list));
-                alreadyIncluded.add(list);
-            }
+    /**
+     * Mutable integer counter that starts at 1. Used as map value when
+     * counting occurrences, so that the first occurrence is covered by
+     * creating the counter and later ones by calling inc.
+     */
+    class Counter {
+        int value = 1;
+        /** Increase the counter by 1 and return this counter. */
+        public Counter inc() {
+            this.value += 1;
+            return this;
         }
+    }
 
-        scanner.close();
-        return result;
+    /**
+     * For every pair of notebooks in nbOccurrences, add the occurrence count
+     * of the other notebook as intra repo connections if getRepo returns the
+     * same object for both notebooks, and as inter repo connections otherwise.
+     * Maps with more than 100000 notebooks are skipped with a message.
+     * @param nbOccurrences Occurrence count per notebook
+     * @param emptyConnections Passed on as second argument of the add*RepoConnections calls
+     */
+    private void postProcessRepoConnections(HashMap<NotebookFile, Counter> nbOccurrences, boolean emptyConnections) {
+        List<NotebookFile> noteBooks =
+            new ArrayList<NotebookFile>(nbOccurrences.keySet());
+
+        final int noteBooksSize = noteBooks.size();
+
+        // The loops below are quadratic in the number of notebooks
+        if (noteBooksSize > 100000) {
+            printTimeStampedMsg("Skipping big (" + noteBooksSize + ") clonelist");
+            /// TODO: optimise
+            return;
+        }
+        
+        for (int i = 0; i < noteBooksSize; ++i) {
+            NotebookFile nbi = noteBooks.get(i);
+            final Repository ri = nbi.getRepo();
+
+            // NOTE(review): j starts at i, so every notebook is also paired with
+            // itself and gets its own count added twice - confirm this is intended.
+            for (int j = i; j < noteBooksSize; ++j) {
+                NotebookFile nbj = noteBooks.get(j);
+                final Repository rj = nbj.getRepo();
+
+                // Repositories are compared by reference
+                if (ri == rj) {
+                    nbi.addIntraRepoConnections(nbOccurrences.get(nbj).value, emptyConnections);
+                    nbj.addIntraRepoConnections(nbOccurrences.get(nbi).value, emptyConnections);
+                } else {
+                    nbi.addInterRepoConnections(nbOccurrences.get(nbj).value, emptyConnections);
+                    nbj.addInterRepoConnections(nbOccurrences.get(nbi).value, emptyConnections);
+                }
+            }
+        }
     }
-	
+    
+    /**
+     * Update the connection counters of the notebooks from the clone lists
+     * and print one line per notebook on standard error.
+     * For every clone list, the number of snippets per notebook is counted,
+     * separately for empty and non-empty snippets. That count is added to the
+     * notebook as intra connections and the rest of the list size as inter
+     * connections. After that the repo connections are updated by
+     * postProcessRepoConnections.
+     * The normalized and mean values in the printed lines are not computed
+     * yet; they are always 0.
+     * @param cloneLists One list of snippet ids per group of clones
+     */
+    private void constructConnectionGraphInfo(List<List<SccSnippetId>> cloneLists) {
+        printTimeStampedMsg("constructConnectionGraphInfo start");
+
+        /// For each list of snippets considered clones ...
+        for (List<SccSnippetId> cloneList : cloneLists) {
+            // Size of the whole list, empty and non-empty snippets alike
+            final int totalConnections = cloneList.size();
+            
+            final HashMap<NotebookFile, Counter> notebookOccurrences = new HashMap<NotebookFile, Counter>();
+            final HashMap<NotebookFile, Counter> notebookOccurrencesEmptySnippets = new HashMap<NotebookFile, Counter>();
 
+            /// Count the number of times each notebook appears (separate empty and non-empty)
+            for (SccSnippetId sid : cloneList) {
+                final NotebookFile nb = sid.getNotebook();
+                final HashMap<NotebookFile, Counter> map = (sid.isEmpty()
+                                                            ? notebookOccurrencesEmptySnippets
+                                                            : notebookOccurrences);
+                /// if nb --> c exists, do c.inc() else, install new counter with value 1
+                map.compute(nb, (k, c) -> (c == null) ? new Counter() : c.inc());
+            }
+
+            // Non-empty snippets: occurrences in the notebook itself count as
+            // intra connections, the remainder of the list as inter connections
+            for (Map.Entry<NotebookFile, Counter> kv : notebookOccurrences.entrySet()) {
+                NotebookFile nb = kv.getKey();
+                int intraConnections = kv.getValue().value;
+
+                nb.addIntraConnections(intraConnections, false);
+                nb.addInterConnections(totalConnections - intraConnections, false);
+            }
+
+            // Same for the empty snippets, recorded with the empty flag set
+            for (Map.Entry<NotebookFile, Counter> kv : notebookOccurrencesEmptySnippets.entrySet()) {
+                NotebookFile nb = kv.getKey();
+                int intraConnections = kv.getValue().value;
+
+                nb.addIntraConnections(intraConnections, true);
+                nb.addInterConnections(totalConnections - intraConnections, true);
+            }
+
+            postProcessRepoConnections(notebookOccurrences, false);
+            postProcessRepoConnections(notebookOccurrencesEmptySnippets, true);
+        }
+
+        printTimeStampedMsg("printing starts");
+        
+        // One comma separated line per notebook
+        NotebookFile.forAll(nb -> {
+                final float normalizedConnections = 0; // TODO
+                final float normalizedNonEmptyConnections = 0; // TODO
+                final float meanInterReproConnections = 0; // TODO
+                final float meanNonEmptyInterReproConnections = 0; // TODO
+
+                final int nonEmptyConnections = nb.intraConnections() + nb.interConnections();
+                final int connections = nonEmptyConnections + nb.intraEmptyConnections() + nb.interEmptyConnections();
+
+                final int nonEmptyIntraReproConnections = nb.intraRepoConnections();
+                final int intraReproConnections = nonEmptyIntraReproConnections + nb.intraRepoEmptyConnections();
+
+                // Locale.US makes the decimal separator a point
+                String result = String.format(Locale.US,
+                                              "%s, %d, %.4f, %d, %.4f, %d, %d, %.4f, %.4f",
+                                              nb.fileName(),
+                                              connections,
+                                              normalizedConnections, 
+                                              nonEmptyConnections,
+                                              normalizedNonEmptyConnections,
+                                              intraReproConnections,
+                                              nonEmptyIntraReproConnections,
+                                              meanInterReproConnections,
+                                              meanNonEmptyInterReproConnections);
+
+                // TODO: write this to file instead
+                System.err.println(result);
+            });
+    }
+    
     private Map<SnippetCode, List<Snippet>> getCloneMap(List<List<SccSnippetId>> clones)
-			throws FileNotFoundException {
-		Map<SnippetCode, List<Snippet>> result = new HashMap<SnippetCode, List<Snippet>>(clones.size());
-		Set<SccSnippetId> snippetIdsToAdd = notebookNumbers.keySet();
-		int hashIndex = 0;
+        throws FileNotFoundException {
+        Map<SnippetCode, List<Snippet>> result = new HashMap<SnippetCode, List<Snippet>>(clones.size());
+        Set<SccSnippetId> snippetIdsToAdd = notebookNumbers.keySet();
+        int hashIndex = 0;
 		
-		// Cloned snippets
-		for (List<SccSnippetId> cloned: clones) {
-			if (0 == hashIndex%10000) {
-				System.out.println("Creating entry  for " + hashIndex + " in snippet-to-files-map.");
-			}
-			List<Snippet> snippets = new ArrayList<Snippet>();
-			int numClones = cloned.size();
-			List<Integer> loc = new ArrayList<Integer>(numClones);
-			for (int i=0; i<numClones; i++) {
-				SccSnippetId id = cloned.get(i);
-				addSnippet(id, snippets);
-				snippetIdsToAdd.remove(id);
-				loc.add(linesOfCode.get(cloned.get(i)));
-			}
-			int medianLoc = Utils.median(loc, "Different line count for snippet " + Integer.toString(hashIndex));
-			SnippetCode hash = new SnippetCode(medianLoc, Integer.toString(hashIndex++));
-			result.put(hash, snippets);
-		}
+        // Cloned snippets
+        for (List<SccSnippetId> cloned: clones) {
+            if (0 == hashIndex%100000000) {
+                SccOutputAnalyzer.printTimeStampedMsg("Creating entry for " + hashIndex + " in snippet-to-files-map.");
+            }
+            List<Snippet> snippets = new ArrayList<Snippet>();
+            int numClones = cloned.size();
+            List<Integer> loc = new ArrayList<Integer>(numClones);
+            for (int i=0; i<numClones; i++) {
+                SccSnippetId id = cloned.get(i);
+                if (id == null) {
+                    SccOutputAnalyzer.printTimeStampedMsg("Skipping null cloned for i = " + i);
+                    continue; 
+                }
+                addSnippet(id, snippets);
+                snippetIdsToAdd.remove(id);
+                loc.add(linesOfCode.get(cloned.get(i)));
+            }
+            int medianLoc = Utils.median(loc, "Different line count for snippet " + Integer.toString(hashIndex));
+            SnippetCode hash = new SnippetCode(medianLoc, Integer.toString(hashIndex++));
+            result.put(hash, snippets);
+        }
 		
-		// Remaining snippets are unique. Add them!
-		for (SccSnippetId id: snippetIdsToAdd) {
-			if (0 == hashIndex%10000) {
-				System.out.println("Creating entry  for " + hashIndex + " in snippet-to-files-map.");
-			}
-			List<Snippet> snippets = new ArrayList<>(1);
-			addSnippet(id, snippets);
-			snippetIdsToAdd.remove(id);
-			int loc = linesOfCode.get(id);
-			SnippetCode hash = new SnippetCode(loc, Integer.toString(hashIndex++));
-			result.put(hash, snippets);
-		}
-		return result;
-	}
-
-	/**
-	 * Add the snippet with the specified SourcererCC snippet id to the list
-	 * specified.
-	 * @param id SourcererCC snippet id of snippet to add
-	 * @param snippets List of snippets, to which the snippet will be added
-	 */
-	private void addSnippet(SccSnippetId id, List<Snippet> snippets) {
-		String notebookName = getNotebookNameFromNumber(notebookNumbers.get(id)); 
-		int snippetIndex = snippetIndices.get(id);
-		snippets.add(new Snippet(notebookName, repros.get(notebookName), snippetIndex));
-	}
+        // Remaining snippets are unique. Add them!
+        for (SccSnippetId id: snippetIdsToAdd) {
+            if (0 == hashIndex%100000000) {
+                printTimeStampedMsg("Creating entry  for " + hashIndex + " in snippet-to-files-map.");
+            }
+            List<Snippet> snippets = new ArrayList<>(1);
+            addSnippet(id, snippets);
+            /// snippetIdsToAdd.remove(id); /// FIXME: this line throws a ConcurrentModificationException
+            int loc = linesOfCode.get(id);
+            SnippetCode hash = new SnippetCode(loc, Integer.toString(hashIndex++));
+            result.put(hash, snippets);
+        }
+        return result;
+    }
+
+    /**
+     * Create a Snippet for the SourcererCC snippet id specified and append it
+     * to the list specified. Notebook number and snippet index are looked up
+     * in notebookNumbers and snippetIndices, the repro in repros.
+     * @param id SourcererCC snippet id of snippet to add
+     * @param snippets List of snippets, to which the snippet will be added
+     */
+    private void addSnippet(SccSnippetId id, List<Snippet> snippets) {
+        final String nameOfNotebook = getNotebookNameFromNumber(notebookNumbers.get(id));
+        final int indexInNotebook = snippetIndices.get(id);
+        final Snippet snippet = new Snippet(nameOfNotebook, repros.get(nameOfNotebook), indexInNotebook);
+        snippets.add(snippet);
+    }
 	
-	private Map<Notebook, SnippetCode[]> getSnippets(Map<SnippetCode, List<Snippet>> snippet2file) {
-		Map<Notebook, SnippetCode[]> result = new HashMap<Notebook, SnippetCode[]>(snippetsPerNotebook.size());
-		// Create arrays for snippets
-		for (String notebookName: snippetsPerNotebook.keySet()) {
-			String repro = repros.get(notebookName);
-			result.put(new Notebook(notebookName, repro), new SnippetCode[snippetsPerNotebook.get(notebookName)]);
+    /**
+     * Create the map from notebooks to the codes of their snippets
+     * (file2hashes) by inverting snippet2file.
+     * @param snippet2file Map from snippet codes to the snippets having that code
+     * @return Map with one array per notebook in snippetsPerNotebook; the
+     * array has one slot per snippet of the notebook and is filled by snippet index
+     */
+    private Map<Notebook, SnippetCode[]> getSnippets(Map<SnippetCode, List<Snippet>> snippet2file) {
+        Map<Notebook, SnippetCode[]> result = new HashMap<Notebook, SnippetCode[]>(snippetsPerNotebook.size());
+        // Create arrays for snippets
+        for (String notebookName: snippetsPerNotebook.keySet()) {
+            String repro = repros.get(notebookName);
+            result.put(new Notebook(notebookName, repro), new SnippetCode[snippetsPerNotebook.get(notebookName)]);
 			
-		}
-		// Put snippet in notebook-to-snippet-map
-		int numAdded = 0;
-		for (SnippetCode hash: snippet2file.keySet()) {
-			if (0 == numAdded%10000) {
-				System.out.println("Adding snippet " + hash + " to notebook-to-snippet-map.");
-			}
-			for (Snippet snippet: snippet2file.get(hash)) {
-				SnippetCode[] snippetsInFile = result.get(new Notebook(snippet.getFileName()));
-				snippetsInFile[snippet.getSnippetIndex()] = new SnippetCode(hash);
-			}
-			numAdded++;
-		}
-		return result;
-	}
-	
-	private static String getNotebookNameFromNumber(int notebookNumber) {
-		return "nb_" + notebookNumber + ".ipynb";
-	}
+        }
+        // Put snippet in notebook-to-snippet-map
+        int numAdded = 0;
+        for (SnippetCode hash: snippet2file.keySet()) {
+            if (0 == numAdded%100000000) {
+                printTimeStampedMsg("Adding snippet " + hash + " to notebook-to-snippet-map.");
+            }
+            for (Snippet snippet: snippet2file.get(hash)) {
+                // The key is built without repro here but with repro above;
+                // presumably Notebook equality depends on the name only - confirm
+                SnippetCode[] snippetsInFile = result.get(new Notebook(snippet.getFileName()));
+                snippetsInFile[snippet.getSnippetIndex()] = new SnippetCode(hash);
+            }
+            numAdded++;
+        }
+        return result;
+    }
 	
-	void analyze(String[] args) {
-		String pairFile = null;
-		
-		// Set up
-		for (int i=0; i<args.length; i++) {
-			String arg = args[i];
-			if (arg.startsWith("--stats_file")) {
-				String statsFile = getValueFromArgument(arg);
-				try {
-					initializeSnippetInfo(statsFile);
-				} catch (FileNotFoundException e) {
-					System.err.println("Stats file not found: " + e.getMessage());
-				}
-			} else if (arg.startsWith("--repro_file")) {
-				String reproFile = getValueFromArgument(arg);
-				try {
-					this.initializeReproMap(reproFile);
-				} catch (FileNotFoundException e) {
-					System.err.println("Repro file not found: " + e.getMessage());
-				}
-			} else if (arg.startsWith("--pair_file")) {
-				pairFile = getValueFromArgument(arg);
-			} else if (arg.startsWith("--output_dir")) {
-				outputDir = getValueFromArgument(arg);
-			} else {
-				System.err.println("Unknown argument: " + arg);
-			}
-		}
+    private static String getNotebookNameFromNumber(int notebookNumber) {
+        return "nb_" + notebookNumber + ".ipynb";
+    }
+
+
+    /**
+     * Read the stats and repro files named in the arguments, then run the clone analysis on the pair file.
+     * @param args Command line arguments: --stats_file, --repro_file, --pair_file and --output_dir
+     */
+    void analyze(String[] args) {
+        String pairFile = null;
+
+        // Set up
+        for (int i=0; i<args.length; i++) {
+            String arg = args[i];
+            if (arg.startsWith("--stats_file")) {
+                String statsFile = getValueFromArgument(arg);
+                try {
+                    printTimeStampedMsg("initializeSnippetInfo start");
+                    registerSnippets(statsFile);
+                    printTimeStampedMsg("initializeSnippetInfo done");
+                } catch (FileNotFoundException e) {
+                    System.err.println("Stats file not found: " + e.getMessage());
+                }
+            } else if (arg.startsWith("--repro_file")) {
+                String reproFile = getValueFromArgument(arg);
+                try {
+                    printTimeStampedMsg("initializeReproMap start");
+                    registerRepos(reproFile);
+                    printTimeStampedMsg("initializeReproMap done");
+                } catch (FileNotFoundException e) {
+                    System.err.println("Repro file not found: " + e.getMessage());
+                }
+            } else if (arg.startsWith("--pair_file")) {
+                pairFile = getValueFromArgument(arg);
+            } else if (arg.startsWith("--output_dir")) {
+                outputDir = getValueFromArgument(arg);
+            } else {
+                System.err.println("Unknown argument: " + arg);
+            }
+        }
 		
-		// Run
-		// (If notebookNumbers is null, none of the snippet info maps are initialized.)
-		if (null != pairFile && "" != pairFile && null != notebookNumbers &&  null !=this.repros) {
-			try {
-				this.clones(pairFile);
-				System.out.println("Clone files created!");
-			} catch (IOException e) {
-				System.err.println("I/O error: " + e.getMessage() + ". Operation interrupted.");
-			}
-		} else {
-			if (null == pairFile || "" == pairFile) {
-				System.err.println("SourcererCC clones pair file path not set!");
-			}
-			if (null == notebookNumbers) {
-				System.err.println("Snippet information is not initialized!");
-			}
-			if (null == this.repros) {
-				System.err.println("Repro information is not initialized!");
-			}
-			System.err.println("Analysis will not be run!");
-		}
-	}
+        // Run
+        // (If notebookNumbers is null, none of the snippet info maps are initialized.)
+        if (null != pairFile && !pairFile.isEmpty()) { // && null != notebookNumbers &&  null !=this.repros) {
+            try {
+                printTimeStampedMsg("clones start");
+                this.clones(pairFile);
+                printTimeStampedMsg("clones stop");
+                printTimeStampedMsg("Clone files created!");
+            } catch (IOException e) {
+                System.err.println("I/O error: " + e.getMessage() + ". Operation interrupted.");
+            }
+        } else {
+            if (null == pairFile || pairFile.isEmpty()) {
+                System.err.println("SourcererCC clones pair file path not set!");
+            }
+            if (null == notebookNumbers) {
+                System.err.println("Snippet information is not initialized!");
+            }
+            if (null == this.repros) {
+                System.err.println("Repro information is not initialized!");
+            }
+            System.err.println("Analysis will not be run!");
+        }
+    }
 	
-	public static void main(String[] args) {
-		SccOutputAnalyzer analyzer = new SccOutputAnalyzer();
-		analyzer.analyze(args);
-	}
+    public static void main(String[] args) {
+        printTimeStampedMsg("Start");
+        SccOutputAnalyzer analyzer = new SccOutputAnalyzer();
+        analyzer.analyze(args);
+        ThreadExecutor.tearDown();
+        printTimeStampedMsg("Stop");
+    }
+}
+
+class CloneGroup {
+    private CloneGroup next;
+    private static long counter = 0;
+
+    private static CloneGroup merge(CloneGroup a, CloneGroup b) {
+        a = a.top();
+        b = b.top();
+
+        if (a == b) {
+            // Already joined
+        } else {
+            a.next = b;
+        }
+
+        return b;
+    }
+
+    public CloneGroup top() {
+        CloneGroup root = this;
+        while (root.next != null) root = root.next; /// Iterative: deep recursion could overflow the stack
+        for (CloneGroup g = this; g != root; ) { /// Path compression: repoint the whole path at the root
+            CloneGroup up = g.next; g.next = root; g = up;
+        }
+        return root;
+    }
+        
+    public CloneGroup merge(CloneGroup s) {
+        return CloneGroup.merge(this, s);
+    }
+
+    // TODO: optimise
+    // NOTE: Assumes clones is compacted
+    public static List<SccSnippetId> addToList(HashMap<Integer, CloneGroup> clones, CloneGroup c, Map<Integer, SccSnippetId> intToSnippet) {
+        ArrayList<SccSnippetId> result = new ArrayList<SccSnippetId>();
+
+        Set<Integer> keys = clones.keySet();
+        int[] array = new int[keys.size()];
+        int index = 0;
+        for(Integer element : keys) array[index++] = element.intValue();
+        
+        for(int key : array) {
+            CloneGroup value = clones.get(key);
+            if (value == c) {
+                result.add(intToSnippet.get(key));
+                clones.remove(key);
+            }
+        }
+
+        return result;
+    }
+
+    public static List<List<SccSnippetId>> convertResult(HashMap<Integer, CloneGroup> clones,
+                                                         List<SccSnippetId> intToSnippet) {
+            return new ArrayList<List<SccSnippetId>>(invertMap(clones, intToSnippet).values());
+    }
+
+    public static HashMap<CloneGroup, List<SccSnippetId>> invertMap(HashMap<Integer, CloneGroup> clones, List<SccSnippetId> intToSnippet) {
+        // Required for correctness
+        CloneGroup.compact(clones);
+
+        final HashMap<CloneGroup, List<SccSnippetId>> outerResult = new HashMap<CloneGroup, List<SccSnippetId>>();
+
+        final Set<Integer> keySet = clones.keySet();
+        SccOutputAnalyzer.printTimeStampedMsg("inverting map with domain size: " + keySet.size());
+
+        int progress = 0;
+        for (Integer key : keySet) {
+            if (progress++ % 10000 == 0) SccOutputAnalyzer.printTimeStampedMsg("Processed " + progress + " keys");
+
+            CloneGroup cg = clones.get(key);
+            List<SccSnippetId> list = outerResult.get(cg);
+            
+            if (list == null) {
+                list = new ArrayList<SccSnippetId>();
+                list.add(intToSnippet.get(key));
+                outerResult.put(cg, list);
+            } else {
+                list.add(intToSnippet.get(key));
+            }
+        }
+        
+        return outerResult;
+    }
+
+    
+    public static long compact(HashMap<Integer, CloneGroup> clones) {
+        long compaction = 0;
+        
+        for(Map.Entry<Integer, CloneGroup> entry : clones.entrySet()){
+            CloneGroup cs = entry.getValue();
+
+            if (cs.next == null) continue;
+
+            entry.setValue(cs.top());
+            ++compaction;
+        }
+
+        return compaction;
+    }
+    
+    /**
+     * Record one pair-file line ({@code nb1,snippet1,nb2,snippet2}) in the clone map.
+     * Malformed lines and lines naming an unregistered snippet are reported on stderr
+     * and skipped, so that no null key (which invertMap would fail to unbox) enters the map.
+     * @param clones Map from snippet id to the clone group it belongs to
+     * @param line One line from the SourcererCC pair file
+     */
+    public static void addToCloneList(HashMap<Integer, CloneGroup> clones, String line) {
+        int middleComma = line.indexOf(',', line.indexOf(',') + 1);
+        if (middleComma < 0) {
+            System.err.println("Skipping malformed clone pair line: " + line);
+            return;
+        }
+
+        Integer id1, id2;
+        try {
+            id1 = SccSnippetId.getId(line.substring(0, middleComma));
+            id2 = SccSnippetId.getId(line.substring(middleComma + 1));
+        } catch (NumberFormatException nfe) {
+            return; /// Nothing to do -- happens only once!
+        }
+        if (id1 == null || id2 == null) {
+            System.err.println("Skipping clone pair with unknown snippet id: " + line);
+            return;
+        }
+
+        CloneGroup id1Clones = clones.get(id1);
+        CloneGroup id2Clones = clones.get(id2);
+
+        if (id1Clones == id2Clones) {
+            if (id1Clones != null) {
+                /// We already had them marked as clones
+            } else {
+                /// Create a new clone set with this data
+                CloneGroup top = new CloneGroup();
+                clones.put(id1, top);
+                clones.put(id2, top);
+            }
+        } else {
+            /// Merge the sets as they are both clones, and point both to same set
+            if (id1Clones == null) {
+                clones.put(id1, id2Clones.top());
+            } else if (id2Clones == null) {
+                clones.put(id2, id1Clones.top());
+            } else {
+                CloneGroup top = id1Clones.merge(id2Clones);
+                if (id1Clones != top) clones.put(id1, top);
+                if (id2Clones != top) clones.put(id2, top);
+            }
+        }
+    }
 }
+
+
diff --git a/Programs/src/notebooks/SccSnippetId.java b/Programs/src/notebooks/SccSnippetId.java
index 7d197bb..eb219c1 100644
--- a/Programs/src/notebooks/SccSnippetId.java
+++ b/Programs/src/notebooks/SccSnippetId.java
@@ -1,23 +1,112 @@
 package notebooks;
 
 import java.util.Objects;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.ConcurrentLinkedQueue;
 
 public class SccSnippetId {
-	private final int nbID;
-	private final int snippetID;
-	
+    public final int nbID;
+    public final int snippetID;
+
+    public int linesOfCode = 0;
+    
+    public static final java.util.HashMap<String, Integer> resolver = new java.util.HashMap<String, Integer>();
+    public static final java.util.ArrayList<SccSnippetId> directory = new java.util.ArrayList<SccSnippetId>();
+    
 	public SccSnippetId(int nbID, int snippetID) {
-		this.nbID = nbID;
-		this.snippetID = snippetID;
+      this.nbID = nbID;
+      this.snippetID = snippetID;
+	}
+
+    private SccSnippetId(int nbID, int snippetID, int linesOfCode) {
+        this(nbID, snippetID);
+        this.linesOfCode = linesOfCode;
+    }
+
+	public SccSnippetId(final String sid) throws NumberFormatException {
+      final int commaPos = sid.indexOf(',');
+
+      final String id1 = sid.substring(0, commaPos);
+      final String id2 = sid.substring(commaPos + 1);
+
+      this.nbID = Integer.parseInt(id1);
+      this.snippetID = Integer.parseInt(id2);
+	}
+    
+    public SccSnippetId(final String sid, char separator) throws NumberFormatException {
+      final int commaPos = sid.indexOf(separator);
+
+      final String id1 = sid.substring(0, commaPos);
+      final String id2 = sid.substring(commaPos + 1);
+
+      this.nbID = Integer.parseInt(id1);
+      this.snippetID = Integer.parseInt(id2);
 	}
 
+    public NotebookFile getNotebook() {
+        return NotebookFile.getById(this.nbID);
+    }
+
+    public boolean isEmpty() {
+        return this.linesOfCode == 0; // Confirm with Malin what the correct definition is 
+    }
+    
+    // static int create(int a, int b) throws NumberFormatException {
+    //     return create(a + "," + b);
+    // }
+    
+    // static int create(String sid) throws NumberFormatException {
+    //     int sidHash = sid.hashCode();
+    //     if (directory.containsKey(sidHash) == false) {
+    //         directory.put(sidHash, new SccSnippetId(sid));
+    //     }
+    //     return sidHash;
+    // }
+
+    static SccSnippetId getByPair(String sidPair) throws NumberFormatException {
+        final int at = sidPair.indexOf('@');
+        final String id = sidPair.substring(0, at) + "," + sidPair.substring(at + 1);
+        return directory.get(resolver.get(id));
+    }
+
+    static SccSnippetId getByCommaSeparatedPair(String id) throws NumberFormatException {
+        return directory.get(resolver.get(id));
+    }
+    
+    static Integer getId(String id) throws NumberFormatException {
+        return resolver.get(id);
+    }
+    
+    public static void register(String sid, int nbIB, int snippetID, int linesOfCode) {
+        resolver.put(sid, directory.size()); // index the new snippet is about to occupy, so directory.get(resolver.get(sid)) finds it
+        directory.add(new SccSnippetId(nbIB, snippetID, linesOfCode));
+    }
+    
+    public static void register(String info) {
+        final int firstComma = info.indexOf(',');
+        final int secondComma = info.indexOf(',', firstComma + 1);
+        final int ultimateComma = info.lastIndexOf(',');
+        final int penUltimateComma = info.lastIndexOf(',', ultimateComma - 1);
+
+        final int nbID = Integer.parseInt(info.substring(0, firstComma));
+        final int snippetID = Integer.parseInt(info.substring(firstComma + 1, secondComma));
+        final int nbNumber = Integer.parseInt(info.substring(info.indexOf('_') + 1,
+                                                             info.indexOf('.')));
+        final int linesOfCode = Integer.parseInt(info.substring(penUltimateComma + 1, ultimateComma));
+        
+        SccSnippetId.register(info.substring(0, secondComma), nbID, snippetID, linesOfCode);
+        NotebookFile.register(nbID, nbNumber); 
+    }
+    
 	@Override
 	public boolean equals(Object other) {
-		if (other.getClass() != this.getClass()) {
+		if (other instanceof SccSnippetId) {
+        SccSnippetId otherId = (SccSnippetId)other;
+        return this.nbID == otherId.nbID && this.snippetID == otherId.snippetID;
+		} else {
 			return false;
-		}
-		SccSnippetId otherId = (SccSnippetId)other;
-		return this.nbID == otherId.nbID && this.snippetID == otherId.snippetID;
+    }
 	}
 	
 	@Override
diff --git a/Programs/src/notebooks/ThreadExecutor.java b/Programs/src/notebooks/ThreadExecutor.java
index 699c8d1..be3f9a7 100644
--- a/Programs/src/notebooks/ThreadExecutor.java
+++ b/Programs/src/notebooks/ThreadExecutor.java
@@ -13,12 +13,12 @@
  */
 public class ThreadExecutor {
 	private static ThreadExecutor instance;
-	private ExecutorService threadPool;
+	public final  ExecutorService threadPool;
 	
 	private ThreadExecutor() {
 		int cores = Runtime.getRuntime().availableProcessors();
-		System.out.println("Setting up a thread pool with " + (2*cores) + " threads.");
-		threadPool = Executors.newFixedThreadPool(2*cores);
+		System.out.println("Setting up a thread pool with " + Math.max(1, cores/2) + " threads.");
+		threadPool = Executors.newFixedThreadPool(Math.max(1, cores/2)); // cores/2 is 0 on a single core, which newFixedThreadPool rejects
 	}
 	
 	/**
diff --git a/Programs/src/notebooks/Utils.java b/Programs/src/notebooks/Utils.java
index c1d9a80..a9069b8 100644
--- a/Programs/src/notebooks/Utils.java
+++ b/Programs/src/notebooks/Utils.java
@@ -12,6 +12,7 @@ public class Utils {
 	 * @return median of values
 	 */
 	public static int median(List<Integer> values, String msg) {
+      if (values.size() == 0) return 0; // FIXME: defensive but should it be offensive?
 		Collections.sort(values);
 		int min = values.get(0);
 		int max = values.get(values.size()-1);