From ae0fa98a63fa4fe81e6b25d80c07560d1d330d76 Mon Sep 17 00:00:00 2001 From: Tobias Wrigstad Date: Wed, 22 Apr 2020 22:50:00 +0200 Subject: [PATCH 1/3] Optimised clone list creation --- Programs/src/notebooks/SccOutputAnalyzer.java | 93 ++++++++++++++++++- Programs/src/notebooks/SccSnippetId.java | 4 + 2 files changed, 93 insertions(+), 4 deletions(-) diff --git a/Programs/src/notebooks/SccOutputAnalyzer.java b/Programs/src/notebooks/SccOutputAnalyzer.java index 2d56bf1..0f479fd 100644 --- a/Programs/src/notebooks/SccOutputAnalyzer.java +++ b/Programs/src/notebooks/SccOutputAnalyzer.java @@ -5,6 +5,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -132,7 +133,22 @@ private void addOrIncrease(Map map, String key) { * files from SourcererCC. */ private Map> getClones(String pairFile) throws IOException { - List> clones = getCloneLists(pairFile); + // List> clones = getCloneLists(pairFile); + // List> clones2 = getCloneListsAlt(pairFile); + + // HashSet> a = new HashSet>(); + // HashSet> b = new HashSet>(); + + // for (List il : clones) { + // a.add(new HashSet(il)); + // } + // for (List il : clones2) { + // b.add(new HashSet(il)); + // } + + // System.err.println("ALTERNATIVE " + a.equals(b)); + + List> clones = getCloneListsAlt(pairFile); return getCloneMap(clones); } @@ -140,6 +156,7 @@ private List> getCloneLists(String pairFile) throws FileNotFo List> clones = new ArrayList>(); Scanner scanner = new Scanner(new File(pairFile)); int numRead = 0; + long t1 = System.currentTimeMillis(); while (scanner.hasNextLine()) { String line = scanner.nextLine(); assert(line.matches("[0-9]+,[0-9]+,[0-9]+,[0-9]+")); @@ -169,15 +186,83 @@ private List> getCloneLists(String pairFile) throws FileNotFo clones.add(newCloneList); } numRead++; - if (0 == numRead%1000000) { - System.out.println(numRead + " clone pairs read."); + if (0 == numRead%10000) { + long t2 = System.currentTimeMillis(); + System.out.println(numRead + " clone pairs read in " + (t2 - t1) + " ms"); + t1 = t2; } } scanner.close(); return clones; } + + private List> getCloneListsAlt(String pairFile) throws FileNotFoundException { + Scanner scanner = new Scanner(new File(pairFile)); + int numRead = 0; + + HashMap> clones = new HashMap>(); + + long t1 = System.currentTimeMillis(); + + while (scanner.hasNextLine()) { + String line = scanner.nextLine(); + assert(line.matches("[0-9]+,[0-9]+,[0-9]+,[0-9]+")); + String[] numbers = line.split(","); + SccSnippetId id1 = new SccSnippetId(Integer.parseInt(numbers[0]), Integer.parseInt(numbers[1])); + SccSnippetId id2 = new SccSnippetId(Integer.parseInt(numbers[2]), Integer.parseInt(numbers[3])); + + HashSet id1Clones = clones.get(id1); + HashSet id2Clones = clones.get(id2); + + if (id1Clones == id2Clones) { + if (id1Clones != null) { + /// We already had them marked as clones + } else { + /// Create a new clone set with this data + HashSet newSet = new HashSet(); + newSet.add(id1); + newSet.add(id2); + clones.put(id1, newSet); + clones.put(id2, newSet); + } + } else { + /// Merge the sets as they are both clones, and point both to same set + if (id1Clones == null) { + id2Clones.add(id1); + clones.put(id1, id2Clones); + } else if (id2Clones == null) { + id1Clones.add(id2); + clones.put(id2, id1Clones); + } else { + id1Clones.addAll(id2Clones); + clones.put(id2, id1Clones); + } + } + + numRead++; + if (0 == numRead%10000) { + long t2 = System.currentTimeMillis(); + System.out.println(numRead + " clone pairs read in " + (t2 - t1) + " ms"); + t1 = t2; + } + } + + List> result = new ArrayList>(); + + HashSet> alreadyIncluded = new HashSet>(); + for(HashSet list : clones.values()) { + if (alreadyIncluded.contains(list) == false) { + result.add(new ArrayList(list)); + alreadyIncluded.add(list); + } + } + + scanner.close(); + return result; + } - private Map> getCloneMap(List> clones) + + private Map> getCloneMap(List> clones) throws FileNotFoundException { Map> result = new HashMap>(clones.size()); Set snippetIdsToAdd = notebookNumbers.keySet(); diff --git a/Programs/src/notebooks/SccSnippetId.java b/Programs/src/notebooks/SccSnippetId.java index 4468232..7d197bb 100644 --- a/Programs/src/notebooks/SccSnippetId.java +++ b/Programs/src/notebooks/SccSnippetId.java @@ -24,4 +24,8 @@ public boolean equals(Object other) { public int hashCode() { return Objects.hash(nbID, snippetID); } + + public String toString() { + return this.nbID + "@" + this.snippetID; + } } From d9f15db1a527f6ba4d17802e15463168d5794e1a Mon Sep 17 00:00:00 2001 From: malinkallen Date: Tue, 16 Jun 2020 16:41:07 +0200 Subject: [PATCH 2/3] Verify that the optimization of getCloneLists doesn't produce duplicated clone groups. --- Programs/test/data/scc/clone_pairs_clopt | 6 +++++ Programs/test/data/scc/file_stats_clopt | 4 +++ .../test/notebooks/SccOutputAnalyzerTest.java | 27 +++++++++++++++++-- 3 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 Programs/test/data/scc/clone_pairs_clopt create mode 100644 Programs/test/data/scc/file_stats_clopt diff --git a/Programs/test/data/scc/clone_pairs_clopt b/Programs/test/data/scc/clone_pairs_clopt new file mode 100644 index 0000000..4f80e63 --- /dev/null +++ b/Programs/test/data/scc/clone_pairs_clopt @@ -0,0 +1,6 @@ +1,1,1,2 +1,3,1,4 +1,1,1,3 +1,4,1,1 +1,2,1,3 +1,2,1,4 diff --git a/Programs/test/data/scc/file_stats_clopt b/Programs/test/data/scc/file_stats_clopt new file mode 100644 index 0000000..6941461 --- /dev/null +++ b/Programs/test/data/scc/file_stats_clopt @@ -0,0 +1,4 @@ +1,1,"/path/to/nb_1.zip/nb_1_0.py","NULL/.py","abc",X,X,X,10 +1,2,"/path/to/nb_2.zip/nb_2_0.py","NULL/.py","def",X,X,X,10 +1,3,"/path/to/nb_2.zip/nb_2_1.py","NULL/.py","ghi",X,X,X,10 +1,4,"/path/to/nb_1.zip/nb_1_1.py","NULL/.py","jkl",X,X,X,10 diff --git a/Programs/test/notebooks/SccOutputAnalyzerTest.java b/Programs/test/notebooks/SccOutputAnalyzerTest.java index d6e39e4..d7d0667 100644 --- a/Programs/test/notebooks/SccOutputAnalyzerTest.java +++ b/Programs/test/notebooks/SccOutputAnalyzerTest.java @@ -7,6 +7,7 @@ public class SccOutputAnalyzerTest { private SccOutputAnalyzer analyzer; + private final static String hashPattern = "[0-9,a-f]+"; private final static String notebookNamePattern = "nb_[0-9]+\\.ipynb"; @Before @@ -241,7 +242,7 @@ public void testLocComputation_odd() throws IOException { String[] expectedLines = { hash2filesHeader(), - "[0-9,a-f]+, 13, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+" + hashPattern + ", 13, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+" }; analyzer.clones(statsFile, reproMap, pairFile); @@ -263,7 +264,7 @@ public void testLocComputation_even() throws IOException { String[] expectedLines = { hash2filesHeader(), - "[0-9,a-f]+, 16, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+, nb_[0-9]\\.ipynb, [0-9]" + hashPattern + ", 16, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+" }; analyzer.clones(statsFile, reproMap, pairFile); @@ -271,6 +272,28 @@ public void testLocComputation_even() throws IOException { TestUtils.deleteCloneCsvs(); } + /** + * Verify that a clone group is only considered once after the optimization + * of getCloneLists. + * @throws IOException + */ + @Test + public void testCloneListsOptimization() throws IOException { + String dataDir = "test/data/scc"; + String statsFile = dataDir + "/file_stats_clopt"; + String pairFile = dataDir + "/clone_pairs_clopt"; + String reproFile = "test/data/hash/repros.csv"; + + String[] expectedLines = { + hash2filesHeader(), + hashPattern + ", 10, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+, " + notebookNamePattern + ", [0-9]+" + }; + + analyzer.clones(statsFile, reproFile, pairFile); + TestUtils.checkCsv_matches("hash2filesA", expectedLines); + TestUtils.deleteCloneCsvs(); + } + /** * Verify that an AssertionError is thrown when the clone pairs file is on * the wrong format. From fa18651efef7f8b1f7257520c037b9a77b44d25d Mon Sep 17 00:00:00 2001 From: Tobias Wrigstad Date: Mon, 25 May 2020 00:48:38 +0200 Subject: [PATCH 3/3] Still in dire need of refactoring --- Programs/build.xml | 2 +- Programs/runner.sh | 1 + Programs/src/notebooks/Analyzer.java | 56 +- Programs/src/notebooks/CloneFileWriter.java | 74 +- Programs/src/notebooks/NotebookFile.java | 137 +++ Programs/src/notebooks/Repository.java | 74 ++ Programs/src/notebooks/SccOutputAnalyzer.java | 991 +++++++++++------- Programs/src/notebooks/SccSnippetId.java | 107 +- Programs/src/notebooks/ThreadExecutor.java | 6 +- Programs/src/notebooks/Utils.java | 1 + 10 files changed, 1028 insertions(+), 421 deletions(-) create mode 100644 Programs/runner.sh create mode 100644 Programs/src/notebooks/NotebookFile.java create mode 100644 Programs/src/notebooks/Repository.java diff --git a/Programs/build.xml b/Programs/build.xml index 7529504..e8bf72b 100644 --- a/Programs/build.xml +++ b/Programs/build.xml @@ -27,7 +27,7 @@ - + diff --git a/Programs/runner.sh b/Programs/runner.sh new file mode 100644 index 0000000..f3b6ccd --- /dev/null +++ b/Programs/runner.sh @@ -0,0 +1 @@ +java -ea -XX:+UseParallelGC -cp bin:external/json-simple-1.1.1.jar -Xms6G -Xmx20G notebooks.SccOutputAnalyzer --repro_file=/home/maka4186/notebook_disk/notebook-number_repo.csv --stats_file=/home/maka4186/notebook_disk/SourcererCC_output/files.stats --pair_file=/home/maka4186/notebook_disk/SourcererCC_output/clone.pairs.only.numbers --output_dir=OutputSCC2 diff --git a/Programs/src/notebooks/Analyzer.java b/Programs/src/notebooks/Analyzer.java index 482c5bd..d8f7f42 100644 --- a/Programs/src/notebooks/Analyzer.java +++ b/Programs/src/notebooks/Analyzer.java @@ -4,7 +4,7 @@ import java.io.FileNotFoundException; import java.util.HashMap; import java.util.Map; -import java.util.Scanner; +import java.io.*; public class Analyzer { protected String outputDir = "."; @@ -14,26 +14,38 @@ public class Analyzer { * @param fileName Name of file with mapping from notebook number to repro * @return The map from notebook name to repro */ - protected static Map createReproMap(String fileName) - throws FileNotFoundException { - Map result = new HashMap(); - Scanner scanner = new Scanner(new File(fileName)); - while (scanner.hasNextLine()) { - String line = scanner.nextLine(); - String[] subStrings = line.split(","); - try { - int notebookNumber = Integer.parseInt(subStrings[0]); - String notebookName = "nb_" + notebookNumber + ".ipynb"; - String reproName = subStrings[1]; - result.put(notebookName, reproName); - } catch (NumberFormatException e) { - System.err.println("Notebook numbers in repro file must be integers! Notebook with \"number\" '" - + subStrings[0] + "' is excluded from mapping!"); - } - } - scanner.close(); - return result; - } + protected static Map createReproMap(String fileName) + throws FileNotFoundException { + Map result = new HashMap(); + try { + BufferedReader input = new BufferedReader(new FileReader(new File(fileName))); + + while (true) { + final String line = input.readLine(); + if (line == null) break; + + String[] subStrings = line.split(","); + try { + int notebookNumber = Integer.parseInt(subStrings[0]); + StringBuilder sb = new StringBuilder(); + sb.append("nb_"); + sb.append(notebookNumber); + sb.append(".ipynb"); + String notebookName = sb.toString(); + String reproName = subStrings[1]; + result.put(notebookName, reproName); + } catch (NumberFormatException e) { + System.err.println("Notebook numbers in repro file must be integers! Notebook with \"number\" '" + + subStrings[0] + "' is excluded from mapping!"); + } + } + + input.close(); + } catch (IOException e) { + e.printStackTrace(System.err); + } + return result; + } /** * Get the part of arg located after the (first) '=' sign. If the '=' is @@ -48,4 +60,4 @@ protected String getValueFromArgument(String arg) { return arg.substring(eqIndex + 1); } } -} \ No newline at end of file +} diff --git a/Programs/src/notebooks/CloneFileWriter.java b/Programs/src/notebooks/CloneFileWriter.java index 2bbb263..ddc65f2 100644 --- a/Programs/src/notebooks/CloneFileWriter.java +++ b/Programs/src/notebooks/CloneFileWriter.java @@ -14,6 +14,7 @@ import java.util.concurrent.Future; public class CloneFileWriter { + private LocalDateTime startTime = LocalDateTime.now(); private String outputDir; public CloneFileWriter(String outputDir) { @@ -50,7 +51,7 @@ public void write(Map file2hashes, } private void printFile2hashes(Map file2hashes) throws IOException { - Writer writer = new FileWriter(outputDir + "/file2hashesA" + LocalDateTime.now() + ".csv"); + Writer writer = new FileWriter(outputDir + "/file2hashesA" + startTime + ".csv"); writer.write(file2hashesHeader()); for (Notebook notebook: file2hashes.keySet()) { writer.write(notebook.getName()); @@ -64,7 +65,7 @@ private void printFile2hashes(Map file2hashes) throws I } private void printHash2files(Map> hash2files) throws IOException { - Writer writer = new FileWriter(outputDir + "/hash2filesA" + LocalDateTime.now() + ".csv"); + Writer writer = new FileWriter(outputDir + "/hash2filesA" + startTime + ".csv"); writer.write(hash2filesHeader()); for (SnippetCode code: hash2files.keySet()) { writer.write(code.getHash() + ", " + code.getLOC()); @@ -78,7 +79,7 @@ private void printHash2files(Map> hash2files) throws private void printCloneFrequencies(Map file2hashes, Map> hash2files) throws IOException { - Writer writer = new FileWriter(outputDir + "/cloneFrequency" + LocalDateTime.now() + ".csv"); + Writer writer = new FileWriter(outputDir + "/cloneFrequency" + startTime + ".csv"); writer.write(cloneFrequencyHeader()); for (Notebook notebook: file2hashes.keySet()) { int numClones = 0, numUnique = 0, numClonesNE = 0; @@ -146,32 +147,47 @@ private void printCloneFrequencies(Map file2hashes, * @param hash2files Mapping from snippets to position in notebooks * @param NUM_NOTEBOOKS Maximum number of notebooks to print connection information for */ - private void printConnectionsFile(Map file2hashes, - Map> hash2files, final int NUM_CONNECTIONS) throws IOException { - Writer writer = new FileWriter(outputDir + "/connections" + LocalDateTime.now() + ".csv"); - writer.write(connectionsHeader()); - List notebooks = new ArrayList(file2hashes.keySet()); - Collections.shuffle(notebooks); - int connectionsToPrint = Math.min(NUM_CONNECTIONS, file2hashes.size()); - List> tasks = new ArrayList>(connectionsToPrint); - for (int i=0; i> result = ThreadExecutor.getInstance().invokeAll(tasks); - for (int i=0; i file2hashes, + final Map> hash2files, + final int NUM_CONNECTIONS) throws IOException { + + Writer writer = new FileWriter(outputDir + "/connections" + startTime + ".csv"); + writer.write(connectionsHeader()); + final List notebooks = new ArrayList(file2hashes.keySet()); + ///Collections.shuffle(notebooks); + final int connectionsToPrint = Math.min(NUM_CONNECTIONS, file2hashes.size()); + + // List> tasks = new ArrayList>(connectionsToPrint); + // for (int i=0; i> tasks = new ArrayList>(connectionsToPrint); + for (int i=0; i < 8; i++) { + boolean heartBeat = true; //0 == i%100000000; + final int start = i; + tasks.add(() -> { + for(int j = start; j < connectionsToPrint; j += 8) { + new ConnectionsLineBuilder(notebooks.get(j), file2hashes, hash2files, heartBeat).call(); + } + return null; + }); + } + List> result = ThreadExecutor.getInstance().invokeAll(tasks); + // for (int i=0; i f) { + for (NotebookFile nb : directory) { + if (nb != null) { + f.accept(nb); + } + } + } + + public static NotebookFile getById(int id) { + assert (0 <= id && id < NotebookFile.directory.length) : "Illegal repo id " + id; + assert (NotebookFile.directory[id] != null) : "Tried to lookup non-existing NotebookFile (id=" + id + ")"; + + return NotebookFile.directory[id]; + } + + private final int id; + /// Connections between snippets where inter/intra denotes different/same notebooks + private int intraConnections = 0; + private int intraEmptyConnections = 0; + private int interConnections = 0; + private int interEmptyConnections = 0; + /// Connections between snippets where inter/intra denotes different/same repo + private int intraRepoConnections = 0; + private int intraRepoEmptyConnections = 0; + private int interRepoConnections = 0; + private int interRepoEmptyConnections = 0; + + public Repository getRepo() { + return Repository.getByNotebookNumber(this.id); + } + + public NotebookFile(int id) { + this.id = id; + } + + public void addIntraConnections(boolean empty) { + addIntraConnections(1, empty); + } + + public void addInterConnections(boolean empty) { + addInterConnections(1, empty); + } + + public void addIntraConnections(int value, boolean empty) { + if (empty) { + intraEmptyConnections += value; + } else { + intraConnections += value; + } + } + + public void addInterConnections(int value, boolean empty) { + if (empty) { + interEmptyConnections += value; + } else { + interConnections += value; + } + } + + public int intraConnections() { + return intraConnections; + } + + public int intraEmptyConnections() { + return intraEmptyConnections; + } + + public int interConnections() { + return interConnections; + } + + public int interEmptyConnections() { + return interEmptyConnections; + } + + public void addIntraRepoConnections(boolean empty) { + addIntraRepoConnections(1, empty); + } + + public void addInterRepoConnections(boolean empty) { + addInterRepoConnections(1, empty); + } + + public void addIntraRepoConnections(int value, boolean empty) { + if (empty) { + intraRepoEmptyConnections += value; + } else { + intraRepoConnections += value; + } + } + + public void addInterRepoConnections(int value, boolean empty) { + if (empty) { + interRepoEmptyConnections += value; + } else { + interRepoConnections += value; + } + } + + public int intraRepoConnections() { + return intraRepoConnections; + } + + public int intraRepoEmptyConnections() { + return intraRepoEmptyConnections; + } + + public int interRepoConnections() { + return interRepoConnections; + } + + public int interRepoEmptyConnections() { + return interRepoEmptyConnections; + } + + public String fileName() { + return new StringBuilder() + .append("nb_") + .append(this.id) + .append(".ipynb") + .toString(); + } +} diff --git a/Programs/src/notebooks/Repository.java b/Programs/src/notebooks/Repository.java new file mode 100644 index 0000000..6eb4e3c --- /dev/null +++ b/Programs/src/notebooks/Repository.java @@ -0,0 +1,74 @@ +package notebooks; + +import java.util.HashMap; + +public class Repository { + private static HashMap urlRepo = new HashMap(); + private static HashMap notebookRepo = new HashMap(); + + public static void register(int nb, String url) { + Repository r = getByURL(url); + + if (r == null) { + r = new Repository(); /// Todo: should we save url? + Repository.urlRepo.put(url.hashCode(), r); + } + + Repository.notebookRepo.put(nb, r); + } + + public static Repository getByNotebookNumber(int nb) { + return Repository.notebookRepo.get(nb); + } + + public static Repository getByURL(String url) { + return Repository.urlRepo.get(url.hashCode()); + } + + private Repository() {} + + private int intraConnections = 0; + private int intraEmptyConnections = 0; + private int interConnections = 0; + private int interEmptyConnections = 0; + + public int intraConnections() { + return intraConnections; + } + + public int intraEmptyConnections() { + return intraEmptyConnections; + } + + public int interConnections() { + return interConnections; + } + + public int interEmptyConnections() { + return interEmptyConnections; + } + + public void addIntraConnections(boolean empty) { + addIntraConnections(1, empty); + } + + public void addInterConnections(boolean empty) { + addInterConnections(1, empty); + } + + public void addIntraConnections(int value, boolean empty) { + if (empty) { + ++intraEmptyConnections; + } else { + ++intraConnections; + } + } + + public void addInterConnections(int value, boolean empty) { + if (empty) { + interEmptyConnections += value; + } else { + interConnections += value; + } + } +} diff --git a/Programs/src/notebooks/SccOutputAnalyzer.java b/Programs/src/notebooks/SccOutputAnalyzer.java index 0f479fd..28913d1 100644 --- a/Programs/src/notebooks/SccOutputAnalyzer.java +++ b/Programs/src/notebooks/SccOutputAnalyzer.java @@ -10,391 +10,668 @@ import java.util.List; import java.util.Map; import java.util.Scanner; +import java.io.*; import java.util.Set; +import java.util.Locale; public class SccOutputAnalyzer extends Analyzer { - // Information about each snippet - Map notebookNumbers; - Map snippetIndices; - Map linesOfCode; - // Information about each notebook - private Map repros = null; - Map snippetsPerNotebook = null; - - /** - * Perform the clone analysis based on SourcererCC output files. Write - * file2hashesA.csv, hash2filesA.csv, - * cloneFrequencies.csv and - * connections.csv accordingly. - * This methods initializes snippet and repro information, so you shouldn't - * do it explicitly before the call to this method. - * Note that the ''hashes'' written by this method are not the MD5 hashes - * of the snippets, but just the value of a counter. However, all instances - * of the ''hash'' of a snippet are the same. - * @param statsFile Path to file stats file produced by the SourcererCC tokenizer - * @param reproFile Path to file with mapping from notebook number to repro - * @param pairFile: Path to output file with clone pairs from the SourcererCC clone detection - * @return A map from snippets to files - * @throws IOException - */ - public Map> clones(String statsFile, String reproFile, String pairFile) throws IOException { - initializeSnippetInfo(statsFile); - initializeReproMap(reproFile); - return clones(pairFile); - } + private static final long startTimeStamp = System.currentTimeMillis(); + public static void printTimeStampedMsg(String msg) { + StringBuilder sb = new StringBuilder(); + sb.append(">>> ("); + sb.append((System.currentTimeMillis() - startTimeStamp) / 1000); + sb.append(" sec) "); + sb.append(msg); + System.err.println(sb.toString()); + } + + // Information about each snippet + Map notebookNumbers; + Map snippetIndices; + Map linesOfCode; + // Information about each notebook + private Map repros = null; + Map snippetsPerNotebook = null; - /** - * Perform the clone analysis based on SourcererCC output files. Write - * file2hashesA.csv, hash2filesA.csv, - * cloneFrequencies.csv and - * connections.csv accordingly. - * Note that you have to initialize the snippet and repro information, by - * calling initializeSnippetInfo and initializeReproMap respectively before - * calling this method! - * Note that the ''hashes'' written by this method are not the MD5 hashes - * of the snippets, but just the value of a counter. However, all instances - * of the ''hash'' of a snippet are the same. - * @param pairFile: Path to output file with clone pairs from the SourcererCC clone detection - * @return A map from snippets to files - * @throws IOException - */ - public Map> clones(String pairFile) throws IOException { - System.out.println("Analyzing clones based on SourcererCC output files!"); - System.out.println("NOTE THAT NOTEBOOKS WITHOUT SNIPPETS ARE NOT INCLUDED"); - System.out.println("since they are not included in the SourcererCC data!"); - Map> snippet2file = getClones(pairFile); - Map file2snippet = getSnippets(snippet2file); - new CloneFileWriter(outputDir).write(file2snippet, snippet2file); - return snippet2file; - } + /** + * Perform the clone analysis based on SourcererCC output files. Write + * file2hashesA.csv, hash2filesA.csv, + * cloneFrequencies.csv and + * connections.csv accordingly. + * This methods initializes snippet and repro information, so you shouldn't + * do it explicitly before the call to this method. + * Note that the ''hashes'' written by this method are not the MD5 hashes + * of the snippets, but just the value of a counter. However, all instances + * of the ''hash'' of a snippet are the same. + * @param statsFile Path to file stats file produced by the SourcererCC tokenizer + * @param reproFile Path to file with mapping from notebook number to repro + * @param pairFile: Path to output file with clone pairs from the SourcererCC clone detection + * @return A map from snippets to files + * @throws IOException + */ + public Map> clones(String statsFile, String reproFile, String pairFile) throws IOException { + // initializeSnippetInfo(statsFile); + // initializeReproMap(reproFile); + registerRepos(reproFile); + registerSnippets(statsFile); + return clones(pairFile); + } - /** - * Initialize repro information for each notebook. - * @param fileName Path to file with mapping from notebook number to repro - */ - public void initializeReproMap(String fileName) throws FileNotFoundException { - repros = createReproMap(fileName); - } + /** + * Perform the clone analysis based on SourcererCC output files. Write + * file2hashesA.csv, hash2filesA.csv, + * cloneFrequencies.csv and + * connections.csv accordingly. + * Note that you have to initialize the snippet and repro information, by + * calling initializeSnippetInfo and initializeReproMap respectively before + * calling this method! + * Note that the ''hashes'' written by this method are not the MD5 hashes + * of the snippets, but just the value of a counter. However, all instances + * of the ''hash'' of a snippet are the same. + * @param pairFile: Path to output file with clone pairs from the SourcererCC clone detection + * @return A map from snippets to files + * @throws IOException + */ + public Map> clones(String pairFile) throws IOException { + System.out.println("Analyzing clones based on SourcererCC output files!"); + System.out.println("NOTE THAT NOTEBOOKS WITHOUT SNIPPETS ARE NOT INCLUDED"); + System.out.println("since they are not included in the SourcererCC data!"); + Map> snippet2file = getClones(pairFile); + Map file2snippet = getSnippets(snippet2file); + new CloneFileWriter(outputDir).write(file2snippet, snippet2file); + return snippet2file; + } - /** - * Initialize the maps containing information about each snippet - * @param statsFile Path to file stats file produced by the SourcererCC tokenizer - * @throws FileNotFoundException If the stats file doesn't exist - */ - public void initializeSnippetInfo(String statsFile) throws FileNotFoundException { - Scanner statsScanner = new Scanner(new File(statsFile)); - notebookNumbers = new HashMap(); - snippetIndices = new HashMap(); - linesOfCode = new HashMap(); - snippetsPerNotebook = new HashMap(); - while(statsScanner.hasNextLine()) { - String line = statsScanner.nextLine(); - String[] columns = line.split(","); - int id1 = Integer.parseInt(columns[0]); - int id2 = Integer.parseInt(columns[1]); - SccSnippetId id = new SccSnippetId(id1, id2); - String path = columns[2]; - // Remove directories from filename - String snippetFileName = path.substring(path.lastIndexOf('/') + 1); - // Remove suffix - snippetFileName = snippetFileName.substring(0, snippetFileName.lastIndexOf('.')); - String[] snippetSubStrings = snippetFileName.split("_"); - int notebookNumber = Integer.parseInt(snippetSubStrings[1]); - String notebookName = getNotebookNameFromNumber(notebookNumber); - addOrIncrease(snippetsPerNotebook, notebookName); - notebookNumbers.put(id, notebookNumber); - snippetIndices.put(id, Integer.parseInt(snippetSubStrings[2])); - /* Here we use the number of lines of source code (comments - excluded), which is inconsistent with the clone analysis of the - notebook files, but so is the clone detection -SourcererCC - doesn't consider comments in clone analysis. */ - int loc = Integer.parseInt(columns[8]); - linesOfCode.put(id, loc); - } - statsScanner.close(); - } + /** + * Initialize repro information for each notebook. + * @param fileName Path to file with mapping from notebook number to repro + */ + public void initializeReproMap(String fileName) throws FileNotFoundException { + repros = createReproMap(fileName); + } + + public void registerRepos(String fileName) throws FileNotFoundException { + try (BufferedReader f = new BufferedReader(new FileReader(new File(fileName)))) { + for(String line = f.readLine(); line != null; line = f.readLine()) { + Repository.register(Integer.parseInt(line.substring(0, line.indexOf(','))), // Notebook ID + line.substring(line.indexOf(',') + 1)); // URL + } + + } catch (IOException e) { + e.printStackTrace(System.err); + } + } - /** - * If map contains a value for key, increase it with 1. Else add an entry - * with for key with the value 1. - * @param map Map to modify as stated above - * @param key Key for the entry that will be changed/added - */ - private void addOrIncrease(Map map, String key) { - if (map.containsKey(key)) { - map.put(key, map.get(key) + 1); - } else { - map.put(key, 1); - } - } - - /** - * Create a mapping from snippets to notebooks (hash2files) using output - * files from SourcererCC. - */ - private Map> getClones(String pairFile) throws IOException { - // List> clones = getCloneLists(pairFile); - // List> clones2 = getCloneListsAlt(pairFile); - - // HashSet> a = new HashSet>(); - // HashSet> b = new HashSet>(); + public void registerSnippets(String statsFile) throws FileNotFoundException { + try (BufferedReader f = new BufferedReader(new FileReader(new File(statsFile)))) { + for(String line = f.readLine(); line != null; line = f.readLine()) { + SccSnippetId.register(line); + } + + } catch (IOException e) { + e.printStackTrace(System.err); + } + } + + public void initializeSnippetInfo(String statsFile) throws FileNotFoundException { + BufferedReader input = new BufferedReader(new FileReader(new File(statsFile))); + notebookNumbers = new HashMap(); + snippetIndices = new HashMap(); + linesOfCode = new HashMap(); + snippetsPerNotebook = new HashMap(); + try { + while(true) { + final String line = input.readLine(); + + if (line == null) break; + + String[] columns = line.split(","); + final SccSnippetId id = SccSnippetId + .getByCommaSeparatedPair(line.substring(0, line.indexOf(',', line.indexOf(',') + 1))); + + String path = columns[2]; + // Remove directories from filename + String snippetFileName = path.substring(path.lastIndexOf('/') + 1); + // Remove suffix + snippetFileName = snippetFileName.substring(0, snippetFileName.lastIndexOf('.')); + String[] snippetSubStrings = snippetFileName.split("_"); + int notebookNumber = Integer.parseInt(snippetSubStrings[1]); + addOrIncrease(snippetsPerNotebook, "nb_" + snippetSubStrings[1] + ".ipynb"); + notebookNumbers.put(id, notebookNumber); + snippetIndices.put(id, Integer.parseInt(snippetSubStrings[2])); + /* Here we use the number of lines of source code (comments + excluded), which is inconsistent with the clone analysis of the + notebook files, but so is the clone detection -SourcererCC + doesn't consider comments in clone analysis. */ + int loc = Integer.parseInt(columns[8]); + linesOfCode.put(id, loc); + } + input.close(); + } catch (IOException e) { + e.printStackTrace(System.err); + } + } - // for (List il : clones) { - // a.add(new HashSet(il)); - // } - // for (List il : clones2) { - // b.add(new HashSet(il)); - // } - - // System.err.println("ALTERNATIVE " + a.equals(b)); - List> clones = getCloneListsAlt(pairFile); - return getCloneMap(clones); - } + /** + * If map contains a value for key, increase it with 1. Else add an entry + * with for key with the value 1. + * @param map Map to modify as stated above + * @param key Key for the entry that will be changed/added + */ + private void addOrIncrease(Map map, String key) { + if (map.containsKey(key)) { + map.put(key, map.get(key) + 1); + } else { + map.put(key, 1); + } + } + + /** + * Create a mapping from snippets to notebooks (hash2files) using output + * files from SourcererCC. + */ + private Map> getClones(String pairFile) throws IOException { + List> clones = getCloneLists(pairFile); + + constructConnectionGraphInfo(clones); + // System.exit(0); + + return getCloneMap(clones); + } - private List> getCloneLists(String pairFile) throws FileNotFoundException { - List> clones = new ArrayList>(); - Scanner scanner = new Scanner(new File(pairFile)); - int numRead = 0; - long t1 = System.currentTimeMillis(); - while (scanner.hasNextLine()) { - String line = scanner.nextLine(); - assert(line.matches("[0-9]+,[0-9]+,[0-9]+,[0-9]+")); - String[] numbers = line.split(","); - SccSnippetId id1 = new SccSnippetId(Integer.parseInt(numbers[0]), Integer.parseInt(numbers[1])); - SccSnippetId id2 = new SccSnippetId(Integer.parseInt(numbers[2]), Integer.parseInt(numbers[3])); - boolean bothStored = false; - Iterator> it = clones.iterator(); - while (!bothStored && it.hasNext()) { - List existing = it.next(); - boolean id1stored = existing.contains(id1); - boolean id2stored = existing.contains(id2); - if(id1stored && id2stored) { - bothStored = true; - } else if (id1stored && !id2stored) { - existing.add(id2); - bothStored = true; - } else if(id2stored && !id1stored) { - existing.add(id1); - bothStored = true; - } - } - if (!bothStored) { - List newCloneList = new ArrayList(); - newCloneList.add(id1); - newCloneList.add(id2); - clones.add(newCloneList); - } - numRead++; - if (0 == numRead%10000) { - long t2 = System.currentTimeMillis(); - System.out.println(numRead + " clone pairs read in " + (t2 - t1) + " ms"); - t1 = t2; - } - } - scanner.close(); - return clones; - } - - private List> getCloneListsAlt(String pairFile) throws FileNotFoundException { - Scanner scanner = new Scanner(new File(pairFile)); - int numRead = 0; - - HashMap> clones = new HashMap>(); - - long t1 = System.currentTimeMillis(); - - while (scanner.hasNextLine()) { - String line = scanner.nextLine(); - assert(line.matches("[0-9]+,[0-9]+,[0-9]+,[0-9]+")); - String[] numbers = line.split(","); - SccSnippetId id1 = new SccSnippetId(Integer.parseInt(numbers[0]), Integer.parseInt(numbers[1])); - SccSnippetId id2 = new SccSnippetId(Integer.parseInt(numbers[2]), Integer.parseInt(numbers[3])); - - HashSet id1Clones = clones.get(id1); - HashSet id2Clones = clones.get(id2); - - if (id1Clones == id2Clones) { - if (id1Clones != null) { - /// We already had them marked as clones - } else { - /// Create a new clone set with this data - HashSet newSet = new HashSet(); - newSet.add(id1); - newSet.add(id2); - clones.put(id1, newSet); - clones.put(id2, newSet); + private List> getCloneLists(String pairFile) throws FileNotFoundException { + File cachedResultsOfPreviousRun = new File("getCloneLists.result.txt"); + + if (cachedResultsOfPreviousRun.exists()) { + try (BufferedReader f = new BufferedReader(new FileReader(cachedResultsOfPreviousRun))) { + ArrayList> result = new ArrayList>(); + + for (String line = f.readLine(); line != null; line = f.readLine()) { + ArrayList innerResult = new ArrayList(); + + for (String entry : line.split(",")) { + if (entry.length() > 0) { + innerResult.add(SccSnippetId.getByPair(entry)); + } + } + + result.add(innerResult); } - } else { - /// Merge the sets as they are both clones, and point both to same set - if (id1Clones == null) { - id2Clones.add(id1); - clones.put(id1, id2Clones); - } else if (id2Clones == null) { - id1Clones.add(id2); - clones.put(id2, id1Clones); - } else { - id1Clones.addAll(id2Clones); - clones.put(id2, id1Clones); + + return result; + } catch (IOException e) { + /// fixme add printout + System.exit(-1); + } + } + + return computeCloneLists(pairFile); + } + + private List> computeCloneLists(String pairFile) throws FileNotFoundException { + HashMap clones = new HashMap(); + + long numRead = 0; + try { + final BufferedReader file = new BufferedReader(new FileReader(new File(pairFile))); + + for (String line = file.readLine(); line != null; line = file.readLine()) { + CloneGroup.addToCloneList(clones, line); + ++numRead; + + if ((numRead % 5000000) == 0) { + CloneGroup.compact(clones); + } + + if ((numRead % 1000000) == 0) { + printTimeStampedMsg(numRead + " clone pairs read"); } } - numRead++; - if (0 == numRead%10000) { - long t2 = System.currentTimeMillis(); - System.out.println(numRead + " clone pairs read in " + (t2 - t1) + " ms"); - t1 = t2; + file.close(); + printTimeStampedMsg("Done reading clone pairs (" + clones.size() + ") keys in clones"); + + printTimeStampedMsg("calling convertResult"); + List> result = CloneGroup.convertResult(clones, SccSnippetId.directory); + printTimeStampedMsg("returning from convertResult"); + clones = null; + + FileWriter fw = new FileWriter(new File("getCloneLists.result.txt")); + for (List list : result) { + for(SccSnippetId sid : list) { + fw.write(sid.toString()); + fw.write(','); + } + fw.write('\n'); } + fw.close(); + + return result; + + } catch (IOException e) { + e.printStackTrace(System.err); } - List> result = new ArrayList>(); + return null; + } - HashSet> alreadyIncluded = new HashSet>(); - for(HashSet list : clones.values()) { - if (alreadyIncluded.contains(list) == false) { - result.add(new ArrayList(list)); - alreadyIncluded.add(list); - } + class Counter { + int value = 1; + public Counter inc() { + this.value += 1; + return this; } + } - scanner.close(); - return result; + private void postProcessRepoConnections(HashMap nbOccurrences, boolean emptyConnections) { + List noteBooks = + new ArrayList(nbOccurrences.keySet()); + + final int noteBooksSize = noteBooks.size(); + + if (noteBooksSize > 100000) { + printTimeStampedMsg("Skipping big (" + noteBooksSize + ") clonelist"); + /// TODO: optimise + return; + } + + for (int i = 0; i < noteBooksSize; ++i) { + NotebookFile nbi = noteBooks.get(i); + final Repository ri = nbi.getRepo(); + + for (int j = i; j < noteBooksSize; ++j) { + NotebookFile nbj = noteBooks.get(j); + final Repository rj = nbj.getRepo(); + + if (ri == rj) { + nbi.addIntraRepoConnections(nbOccurrences.get(nbj).value, emptyConnections); + nbj.addIntraRepoConnections(nbOccurrences.get(nbi).value, emptyConnections); + } else { + nbi.addInterRepoConnections(nbOccurrences.get(nbj).value, emptyConnections); + nbj.addInterRepoConnections(nbOccurrences.get(nbi).value, emptyConnections); + } + } + } } - + + private void constructConnectionGraphInfo(List> cloneLists) { + printTimeStampedMsg("constructConnectionGraphInfo start"); + + /// For each list of snippets considered clones ... + for (List cloneList : cloneLists) { + final int totalConnections = cloneList.size(); + + final HashMap notebookOccurrences = new HashMap(); + final HashMap notebookOccurrencesEmptySnippets = new HashMap(); + /// Count the number of times each notebook appears (separate empty and non-empty) + for (SccSnippetId sid : cloneList) { + final NotebookFile nb = sid.getNotebook(); + final HashMap map = (sid.isEmpty() + ? notebookOccurrencesEmptySnippets + : notebookOccurrences); + /// if nb --> c exists, do c.inc() else, install new counter with value 1 + map.compute(nb, (k, c) -> (c == null) ? new Counter() : c.inc()); + } + + for (Map.Entry kv : notebookOccurrences.entrySet()) { + NotebookFile nb = kv.getKey(); + int intraConnections = kv.getValue().value; + + nb.addIntraConnections(intraConnections, false); + nb.addInterConnections(totalConnections - intraConnections, false); + } + + for (Map.Entry kv : notebookOccurrencesEmptySnippets.entrySet()) { + NotebookFile nb = kv.getKey(); + int intraConnections = kv.getValue().value; + + nb.addIntraConnections(intraConnections, true); + nb.addInterConnections(totalConnections - intraConnections, true); + } + + postProcessRepoConnections(notebookOccurrences, false); + postProcessRepoConnections(notebookOccurrencesEmptySnippets, true); + } + + printTimeStampedMsg("printing starts"); + + NotebookFile.forAll(nb -> { + final float normalizedConnections = 0; // TODO + final float normalizedNonEmptyConnections = 0; // TODO + final float meanInterReproConnections = 0; // TODO + final float meanNonEmptyInterReproConnections = 0; // TODO + + final int nonEmptyConnections = nb.intraConnections() + nb.interConnections(); + final int connections = nonEmptyConnections + nb.intraEmptyConnections() + nb.interEmptyConnections(); + + final int nonEmptyIntraReproConnections = nb.intraRepoConnections(); + final int intraReproConnections = nonEmptyIntraReproConnections + nb.intraRepoEmptyConnections(); + + String result = String.format(Locale.US, + "%s, %d, %.4f, %d, %.4f, %d, %d, %.4f, %.4f", + nb.fileName(), + connections, + normalizedConnections, + nonEmptyConnections, + normalizedNonEmptyConnections, + intraReproConnections, + nonEmptyIntraReproConnections, + meanInterReproConnections, + meanNonEmptyInterReproConnections); + + // TODO: write this to file instead + System.err.println(result); + }); + } + private Map> getCloneMap(List> clones) - throws FileNotFoundException { - Map> result = new HashMap>(clones.size()); - Set snippetIdsToAdd = notebookNumbers.keySet(); - int hashIndex = 0; + throws FileNotFoundException { + Map> result = new HashMap>(clones.size()); + Set snippetIdsToAdd = notebookNumbers.keySet(); + int hashIndex = 0; - // Cloned snippets - for (List cloned: clones) { - if (0 == hashIndex%10000) { - System.out.println("Creating entry for " + hashIndex + " in snippet-to-files-map."); - } - List snippets = new ArrayList(); - int numClones = cloned.size(); - List loc = new ArrayList(numClones); - for (int i=0; i cloned: clones) { + if (0 == hashIndex%100000000) { + SccOutputAnalyzer.printTimeStampedMsg("Creating entry for " + hashIndex + " in snippet-to-files-map."); + } + List snippets = new ArrayList(); + int numClones = cloned.size(); + List loc = new ArrayList(numClones); + for (int i=0; i snippets = new ArrayList<>(1); - addSnippet(id, snippets); - snippetIdsToAdd.remove(id); - int loc = linesOfCode.get(id); - SnippetCode hash = new SnippetCode(loc, Integer.toString(hashIndex++)); - result.put(hash, snippets); - } - return result; - } - - /** - * Add the snippet with the specified SourcererCC snippet id to the list - * specified. - * @param id SourcererCC snippet id of snippet to add - * @param snippets List of snippets, to which the snippet will be added - */ - private void addSnippet(SccSnippetId id, List snippets) { - String notebookName = getNotebookNameFromNumber(notebookNumbers.get(id)); - int snippetIndex = snippetIndices.get(id); - snippets.add(new Snippet(notebookName, repros.get(notebookName), snippetIndex)); - } + // Remaining snippets are unique. Add them! + for (SccSnippetId id: snippetIdsToAdd) { + if (0 == hashIndex%100000000) { + printTimeStampedMsg("Creating entry for " + hashIndex + " in snippet-to-files-map."); + } + List snippets = new ArrayList<>(1); + addSnippet(id, snippets); + /// snippetIdsToAdd.remove(id); /// FIXME: this line throws a ConcurrentModificationException + int loc = linesOfCode.get(id); + SnippetCode hash = new SnippetCode(loc, Integer.toString(hashIndex++)); + result.put(hash, snippets); + } + return result; + } + + /** + * Add the snippet with the specified SourcererCC snippet id to the list + * specified. + * @param id SourcererCC snippet id of snippet to add + * @param snippets List of snippets, to which the snippet will be added + */ + private void addSnippet(SccSnippetId id, List snippets) { + String notebookName = getNotebookNameFromNumber(notebookNumbers.get(id)); + int snippetIndex = snippetIndices.get(id); + snippets.add(new Snippet(notebookName, repros.get(notebookName), snippetIndex)); + } - private Map getSnippets(Map> snippet2file) { - Map result = new HashMap(snippetsPerNotebook.size()); - // Create arrays for snippets - for (String notebookName: snippetsPerNotebook.keySet()) { - String repro = repros.get(notebookName); - result.put(new Notebook(notebookName, repro), new SnippetCode[snippetsPerNotebook.get(notebookName)]); + private Map getSnippets(Map> snippet2file) { + Map result = new HashMap(snippetsPerNotebook.size()); + // Create arrays for snippets + for (String notebookName: snippetsPerNotebook.keySet()) { + String repro = repros.get(notebookName); + result.put(new Notebook(notebookName, repro), new SnippetCode[snippetsPerNotebook.get(notebookName)]); - } - // Put snippet in notebook-to-snippet-map - int numAdded = 0; - for (SnippetCode hash: snippet2file.keySet()) { - if (0 == numAdded%10000) { - System.out.println("Adding snippet " + hash + " to notebook-to-snippet-map."); - } - for (Snippet snippet: snippet2file.get(hash)) { - SnippetCode[] snippetsInFile = result.get(new Notebook(snippet.getFileName())); - snippetsInFile[snippet.getSnippetIndex()] = new SnippetCode(hash); - } - numAdded++; - } - return result; - } - - private static String getNotebookNameFromNumber(int notebookNumber) { - return "nb_" + notebookNumber + ".ipynb"; - } + } + // Put snippet in notebook-to-snippet-map + int numAdded = 0; + for (SnippetCode hash: snippet2file.keySet()) { + if (0 == numAdded%100000000) { + printTimeStampedMsg("Adding snippet " + hash + " to notebook-to-snippet-map."); + } + for (Snippet snippet: snippet2file.get(hash)) { + SnippetCode[] snippetsInFile = result.get(new Notebook(snippet.getFileName())); + snippetsInFile[snippet.getSnippetIndex()] = new SnippetCode(hash); + } + numAdded++; + } + return result; + } - void analyze(String[] args) { - String pairFile = null; - - // Set up - for (int i=0; i addToList(HashMap clones, CloneGroup c, Map intToSnippet) { + ArrayList result = new ArrayList(); + + Set keys = clones.keySet(); + int[] array = new int[keys.size()]; + int index = 0; + for(Integer element : keys) array[index++] = element.intValue(); + + for(int key : array) { + CloneGroup value = clones.get(key); + if (value == c) { + result.add(intToSnippet.get(key)); + clones.remove(key); + } + } + + return result; + } + + public static List> convertResult(HashMap clones, + List intToSnippet) { + return new ArrayList>(invertMap(clones, intToSnippet).values()); + } + + public static HashMap> invertMap(HashMap clones, List intToSnippet) { + // Required for correctness + CloneGroup.compact(clones); + + final HashMap> outerResult = new HashMap>(); + + final Set keySet = clones.keySet(); + SccOutputAnalyzer.printTimeStampedMsg("inverting map with domain size: " + keySet.size()); + + int progress = 0; + for (Integer key : keySet) { + if (progress++ % 10000 == 0) SccOutputAnalyzer.printTimeStampedMsg("Processed " + progress + " keys"); + + CloneGroup cg = clones.get(key); + List list = outerResult.get(cg); + + if (list == null) { + list = new ArrayList(); + list.add(intToSnippet.get(key)); + outerResult.put(cg, list); + } else { + list.add(intToSnippet.get(key)); + } + } + + return outerResult; + } + + + public static long compact(HashMap clones) { + long compaction = 0; + + for(Map.Entry entry : clones.entrySet()){ + CloneGroup cs = entry.getValue(); + + if (cs.next == null) continue; + + entry.setValue(cs.top()); + ++compaction; + } + + return compaction; + } + + public static void addToCloneList(HashMap clones, String line) { + int middleComma = line.indexOf(',', line.indexOf(',') + 1); + Integer id1 = null; + Integer id2 = null; + + try { + id1 = SccSnippetId.getId(line.substring(0, middleComma)); + id2 = SccSnippetId.getId(line.substring(middleComma + 1)); + + } catch (NumberFormatException nfe) { + /// Nothing to do -- happens only once! + return; + } + // Integer id1 = line.substring(0, middleComma).hashCode(); + // Integer id2 = line.substring(middleComma + 1).hashCode(); + + // assert(line.matches("[0-9]+,[0-9]+,[0-9]+,[0-9]+")); + + if (id1 == null) { + System.err.println(id1); + System.err.println(id2); + System.err.println(line.substring(0, middleComma)); + System.err.println(line.substring(middleComma + 1)); + System.err.println(line); + } + + CloneGroup id1Clones = clones.get(id1); + CloneGroup id2Clones = clones.get(id2); + + if (id1Clones == id2Clones) { + if (id1Clones != null) { + /// We already had them marked as clones + } else { + /// Create a new clone set with this data + CloneGroup top = new CloneGroup(); + clones.put(id1, top); + clones.put(id2, top); + } + } else { + /// Merge the sets as they are both clones, and point both to same set + if (id1Clones == null) { + clones.put(id1, id2Clones.top()); + } else if (id2Clones == null) { + clones.put(id2, id1Clones.top()); + } else { + CloneGroup top = id1Clones.merge(id2Clones); + if (id1Clones != top) clones.put(id1, top); + if (id2Clones != top) clones.put(id2, top); + } + } + } } + + diff --git a/Programs/src/notebooks/SccSnippetId.java b/Programs/src/notebooks/SccSnippetId.java index 7d197bb..eb219c1 100644 --- a/Programs/src/notebooks/SccSnippetId.java +++ b/Programs/src/notebooks/SccSnippetId.java @@ -1,23 +1,112 @@ package notebooks; import java.util.Objects; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.ConcurrentLinkedQueue; public class SccSnippetId { - private final int nbID; - private final int snippetID; - + public final int nbID; + public final int snippetID; + + public int linesOfCode = 0; + + public static final java.util.HashMap resolver = new java.util.HashMap(); + public static final java.util.ArrayList directory = new java.util.ArrayList(); + public SccSnippetId(int nbID, int snippetID) { - this.nbID = nbID; - this.snippetID = snippetID; + this.nbID = nbID; + this.snippetID = snippetID; + } + + private SccSnippetId(int nbID, int snippetID, int linesOfCode) { + this(nbID, snippetID); + this.linesOfCode = linesOfCode; + } + + public SccSnippetId(final String sid) throws NumberFormatException { + final int commaPos = sid.indexOf(','); + + final String id1 = sid.substring(0, commaPos); + final String id2 = sid.substring(commaPos + 1); + + this.nbID = Integer.parseInt(id1); + this.snippetID = Integer.parseInt(id2); + } + + public SccSnippetId(final String sid, char separator) throws NumberFormatException { + final int commaPos = sid.indexOf(separator); + + final String id1 = sid.substring(0, commaPos); + final String id2 = sid.substring(commaPos + 1); + + this.nbID = Integer.parseInt(id1); + this.snippetID = Integer.parseInt(id2); } + public NotebookFile getNotebook() { + return NotebookFile.getById(this.nbID); + } + + public boolean isEmpty() { + return this.linesOfCode == 0; // Confirm with Malin what the correct definition is + } + + // static int create(int a, int b) throws NumberFormatException { + // return create(a + "," + b); + // } + + // static int create(String sid) throws NumberFormatException { + // int sidHash = sid.hashCode(); + // if (directory.containsKey(sidHash) == false) { + // directory.put(sidHash, new SccSnippetId(sid)); + // } + // return sidHash; + // } + + static SccSnippetId getByPair(String sidPair) throws NumberFormatException { + final int at = sidPair.indexOf('@'); + final String id = sidPair.substring(0, at) + "," + sidPair.substring(at + 1); + return directory.get(resolver.get(id)); + } + + static SccSnippetId getByCommaSeparatedPair(String id) throws NumberFormatException { + return directory.get(resolver.get(id)); + } + + static Integer getId(String id) throws NumberFormatException { + return resolver.get(id); + } + + public static void register(String sid, int nbIB, int snippetID, int linesOfCode) { + directory.add(new SccSnippetId(nbIB, snippetID, linesOfCode)); + resolver.put(sid, directory.size()); + } + + public static void register(String info) { + final int firstComma = info.indexOf(','); + final int secondComma = info.indexOf(',', firstComma + 1); + final int ultimateComma = info.lastIndexOf(','); + final int penUltimateComma = info.lastIndexOf(',', ultimateComma - 1); + + final int nbID = Integer.parseInt(info.substring(0, firstComma)); + final int snippetID = Integer.parseInt(info.substring(firstComma + 1, secondComma)); + final int nbNumber = Integer.parseInt(info.substring(info.indexOf('_') + 1, + info.indexOf('.'))); + final int linesOfCode = Integer.parseInt(info.substring(penUltimateComma + 1, ultimateComma)); + + SccSnippetId.register(info.substring(0, secondComma), nbID, snippetID, linesOfCode); + NotebookFile.register(nbID, nbNumber); + } + @Override public boolean equals(Object other) { - if (other.getClass() != this.getClass()) { + if (other instanceof SccSnippetId) { + SccSnippetId otherId = (SccSnippetId)other; + return this.nbID == otherId.nbID && this.snippetID == otherId.snippetID; + } else { return false; - } - SccSnippetId otherId = (SccSnippetId)other; - return this.nbID == otherId.nbID && this.snippetID == otherId.snippetID; + } } @Override diff --git a/Programs/src/notebooks/ThreadExecutor.java b/Programs/src/notebooks/ThreadExecutor.java index 699c8d1..be3f9a7 100644 --- a/Programs/src/notebooks/ThreadExecutor.java +++ b/Programs/src/notebooks/ThreadExecutor.java @@ -13,12 +13,12 @@ */ public class ThreadExecutor { private static ThreadExecutor instance; - private ExecutorService threadPool; + public final ExecutorService threadPool; private ThreadExecutor() { int cores = Runtime.getRuntime().availableProcessors(); - System.out.println("Setting up a thread pool with " + (2*cores) + " threads."); - threadPool = Executors.newFixedThreadPool(2*cores); + System.out.println("Setting up a thread pool with " + (cores/2) + " threads."); + threadPool = Executors.newFixedThreadPool(cores/2); } /** diff --git a/Programs/src/notebooks/Utils.java b/Programs/src/notebooks/Utils.java index c1d9a80..a9069b8 100644 --- a/Programs/src/notebooks/Utils.java +++ b/Programs/src/notebooks/Utils.java @@ -12,6 +12,7 @@ public class Utils { * @return median of values */ public static int median(List values, String msg) { + if (values.size() == 0) return 0; // FIXME: defensive but should it be offensive? Collections.sort(values); int min = values.get(0); int max = values.get(values.size()-1);