From 337afd60751285ac1325e0967fc5fb0a025ae7a8 Mon Sep 17 00:00:00 2001
From: Mitchell Joblin <mitchell.joblin.ext@siemens.com>
Date: Sat, 21 Oct 2017 21:03:00 +0200
Subject: [PATCH 1/5] Catch edge case when input data frame is length 0

Signed-off-by: Mitchell Joblin <mitchell.joblin.ext@siemens.com>
---
 codeface/R/network_stream.r | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/codeface/R/network_stream.r b/codeface/R/network_stream.r
index 5fad97b7..5b8e3d29 100755
--- a/codeface/R/network_stream.r
+++ b/codeface/R/network_stream.r
@@ -112,6 +112,8 @@ build.dev.net.stream <- function(con, project.id, type, dates.df,
 
 
 construct.edgelist <- function(commit.list, add.co.change.rel, add.semantic.rel) {
+  if(length(commit.list$commit.df) == 0) return(list())
+
   ## Compute relation for developer contribution to common entity
   entity.groups <- aggregate.on.common.entity(commit.list$commit.df)
 

From 1feff0f6ab55f0459744c070cf644c38f64d765a Mon Sep 17 00:00:00 2001
From: Mitchell Joblin <mitchell.joblin.ext@siemens.com>
Date: Sat, 21 Oct 2017 21:05:47 +0200
Subject: [PATCH 2/5] Move entry point to R scripts execution to another file

When persons.r expects to be passed command line parameters,
it becomes impossible for other scripts to source it.

Signed-off-by: Mitchell Joblin <mitchell.joblin.ext@siemens.com>
---
 codeface/R/cluster/persons.r      | 13 -------------
 codeface/R/cluster/run_analysis.r | 31 +++++++++++++++++++++++++++++++
 codeface/project.py               |  2 +-
 3 files changed, 32 insertions(+), 14 deletions(-)
 create mode 100755 codeface/R/cluster/run_analysis.r

diff --git a/codeface/R/cluster/persons.r b/codeface/R/cluster/persons.r
index 44b33fdb..fbfffdca 100755
--- a/codeface/R/cluster/persons.r
+++ b/codeface/R/cluster/persons.r
@@ -1356,16 +1356,3 @@ test.community.quality.modularity <- function() {
   quality <- community.metric(g, g.clust, "modularization")
 
 }
-
-#########################################################################
-##     					 Executed Statements
-#########################################################################
-##----------------------------
-## Parse commandline arguments
-##----------------------------
-
-config.script.run({
-  conf <- config.from.args(positional.args=list("resdir", "range.id"),
-                           require.project=TRUE)
-  performAnalysis(conf$resdir, conf)
-})
diff --git a/codeface/R/cluster/run_analysis.r b/codeface/R/cluster/run_analysis.r
new file mode 100755
index 00000000..2fce2143
--- /dev/null
+++ b/codeface/R/cluster/run_analysis.r
@@ -0,0 +1,31 @@
+#! /usr/bin/env Rscript
+## Analyse the developer connections
+
+## This file is part of Codeface. Codeface is free software: you can
+## redistribute it and/or modify it under the terms of the GNU General Public
+## License as published by the Free Software Foundation, version 2.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+## FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+##
+## Copyright 2010, 2011 by Wolfgang Mauerer <wm@linux-kernel.net>
+## Copyright 2012, 2013, Siemens AG, Wolfgang Mauerer <wolfgang.mauerer@siemens.com>
+## All Rights Reserved.
+
+source("persons.r")
+
+## Entry point of the cluster analysis: parse the command line arguments
+## and run the analysis. Kept apart from persons.r so that persons.r can
+## be sourced by other scripts without command line parameters.
+
+config.script.run({
+  conf <- config.from.args(positional.args=list("resdir", "range.id"),
+                           require.project=TRUE)
+  performAnalysis(conf$resdir, conf)
+})
diff --git a/codeface/project.py b/codeface/project.py
index abb0330b..e37114bb 100644
--- a/codeface/project.py
+++ b/codeface/project.py
@@ -114,7 +114,7 @@ def project_analyse(resdir, gitdir, codeface_conf, project_conf,
 
         #########
         # STAGE 2: Cluster analysis
-        exe = abspath(resource_filename(__name__, "R/cluster/persons.r"))
+        exe = abspath(resource_filename(__name__, "R/cluster/run_analysis.r"))
         cwd, _ = pathsplit(exe)
         cmd = []
         cmd.append(exe)

From b515a5072e8507c1d7c3697cc6bdb6c8183b2b76 Mon Sep 17 00:00:00 2001
From: Mitchell Joblin <mitchell.joblin.ext@siemens.com>
Date: Sat, 21 Oct 2017 21:14:44 +0200
Subject: [PATCH 3/5] Change how plots are saved

Save plots two different ways, according to project and according
to plot type

This just helps to find relevant information more quickly

Signed-off-by: Mitchell Joblin <mitchell.joblin.ext@siemens.com>
---
 codeface/R/cluster/community_metrics.r | 61 +++++++++++++++++++-------
 1 file changed, 46 insertions(+), 15 deletions(-)

diff --git a/codeface/R/cluster/community_metrics.r b/codeface/R/cluster/community_metrics.r
index 4c76cfc2..01c54291 100644
--- a/codeface/R/cluster/community_metrics.r
+++ b/codeface/R/cluster/community_metrics.r
@@ -848,6 +848,30 @@ plot.influence.ts <- function(project.stats) {
   return(plot1)
 }
 
+build.filenames <- function(outdir, project.name, type.name, analysis.method) {
+  ## Return one output path per plot organization, creating its folder:
+  ##   outdir/project/{project.name}_{analysis.method}/{type.name}.png
+  ##   outdir/plot_type/{type.name}_{analysis.method}/{project.name}.png
+  plot.organization <- c("project", "plot_type")
+
+  filenames <- vapply(plot.organization, function(p.org) {
+    ## Project and plot type swap roles between the two layouts
+    by.project <- p.org == "project"
+    folder.key <- if (by.project) project.name else type.name
+    plot.key <- if (by.project) type.name else project.name
+
+    output.path <- file.path(outdir, p.org,
+                             paste(folder.key, "_", analysis.method, sep=""))
+
+    ## Folders are shared between plots; create them only when missing so
+    ## repeated calls do not trigger "already exists" warnings
+    if (!file.exists(output.path)) dir.create(output.path, recursive=TRUE)
+
+    file.path(output.path, paste(plot.key, ".png", sep=""))
+  }, character(1))
+
+  return(filenames)
+}
 
 plot.box <- function(project.df, feature, outdir) {
   ## Select all rows for the feature
@@ -871,13 +895,17 @@ plot.box <- function(project.df, feature, outdir) {
     ylim1[1] <- 0
     p1 = p0 + coord_cartesian(ylim = ylim1*1.05)
 
-    file.dir <- paste(outdir, "/", project.name, "_", analysis.method, sep="")
-    dir.create(file.dir, recursive=T)
-    file.name <- paste(file.dir, "/", feature, ".png",sep="")
-    ggsave(file.name, p1, height=8, width=20)
+    file.names <- build.filenames(outdir, project.name, feature,
+                                  analysis.method)
+
+    sapply(file.names,
+           function(filename) ggsave(filename, p1, height=8,
+                                     width=20))
 
     ## Adjusted box plots for skewed data
-    file.name <- paste(file.dir, "/", feature, "_adjusted.pdf", sep="")
+    file.names <- build.filenames(outdir, project.name,
+                                  paste(feature, "_adjusted.pdf", sep=""),
+                                  analysis.method)
 
     pdf(file.name)
 
@@ -889,11 +917,11 @@ plot.box <- function(project.df, feature, outdir) {
     dev.off()
 
     if(feature %in% c('page.rank','v.degree')) {
-      file.name <- paste(file.dir, '/', feature, "_distribution.pdf", sep="")
       p2 <- ggplot(project.df, aes(x=value)) +
             geom_histogram(aes(y=..density..),colour="black", fill="white") +
             geom_density(alpha=.2, fill="#FF6666")
-      ggsave(file.name, p2, height=8, width=20)
+      sapply(file.names,
+             function(file.name) ggsave(file.name, p2, height=8, width=20))
     }
   }
 }
@@ -925,10 +953,10 @@ plot.series <- function(project.df, feature, outdir) {
                       strip.text.x = element_text(size=15))
   }
 
-  file.dir <- paste(outdir, "/", project.name, "_", analysis.method, sep="")
-  dir.create(file.dir, recursive=T)
-  file.name <- paste(file.dir, "/time_series_metrics.png",sep="")
-  ggsave(file.name, p, height=41, width=20)
+  file.names <- build.filenames(outdir, project.name, "time_series",
+                                  analysis.method)
+  sapply(file.names,
+         function(file.name) ggsave(file.name, p, height=41, width=20))
 }
 
 
@@ -956,10 +984,12 @@ plot.scatter <- function(project.df, feature1, feature2, outdir) {
         facet_wrap( ~ cycle) +
         geom_smooth(method="lm")
 
-    file.dir <- paste(outdir, "/", project.name, "_", analysis.method, sep="")
-    dir.create(file.dir, recursive=T)
-    file.name <- paste(file.dir, "/", feature1, "_vs_", feature2, ".png",sep="")
-    ggsave(file.name, p, height=40, width=40)
+    feature <- paste(feature1, "_vs_", feature2, sep="")
+    file.names <- build.filenames(outdir, project.name, feature,
+                                  analysis.method)
+
+    sapply(file.names,
+           function(file.name) ggsave(file.name, p, height=40, width=40))
   }
 }
 
@@ -1018,6 +1048,7 @@ write.plots.trends <- function(trends, markov.chains, developer.classifications,
   analysis.method <- unique(trends$analysis.method)
 
   file.dir <- paste(outdir, "/", project.name, "_", analysis.method, sep="")
+  dir.create(file.dir, recursive=T)
 
   ## Save markov chain plot
   if(!is.null(markov.chains)) {

From fd5b819f1d5ee409f990e02703ca60c89657628d Mon Sep 17 00:00:00 2001
From: Mitchell Joblin <mitchell.joblin.ext@siemens.com>
Date: Sat, 21 Oct 2017 21:04:14 +0200
Subject: [PATCH 4/5] Add logging statements

Signed-off-by: Mitchell Joblin <mitchell.joblin.ext@siemens.com>
---
 codeface/R/cluster/community_metrics.r | 12 +++++++++++-
 codeface/R/semantic_dependency.r       |  4 ++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/codeface/R/cluster/community_metrics.r b/codeface/R/cluster/community_metrics.r
index 01c54291..8d06bff4 100644
--- a/codeface/R/cluster/community_metrics.r
+++ b/codeface/R/cluster/community_metrics.r
@@ -757,6 +757,7 @@ compute.project.graph.trends <-
       revision.data[sapply(revision.data, is.null)] <- NULL
 
       ## Create igraph object and select communities which are of a minimum size 4
+      loginfo("Compute communities")
       revision.data <-
         mclapply(revision.data, mc.cores=n.cores,
                  function(rev) {
@@ -781,6 +782,7 @@ compute.project.graph.trends <-
       revision.data[sapply(revision.data, is.null)] <- NULL
 
       ## Compute network metrics
+      loginfo("Compute network metrics")
       revision.df.list <-
         mclapply(revision.data, mc.cores=n.cores,
                  function(rev) {
@@ -874,6 +876,8 @@ build.filenames <- function(outdir, project.name, type.name, analysis.method) {
 }
 
 plot.box <- function(project.df, feature, outdir) {
+  loginfo("Plotting box plot for feature %s", feature)
+
   ## Select all rows for the feature
   keep.row <- project.df$metric == feature
   project.df <- project.df[keep.row,]
@@ -927,6 +931,8 @@ plot.box <- function(project.df, feature, outdir) {
 }
 
 plot.series <- function(project.df, feature, outdir) {
+  loginfo("Plot time series for feature %s", feature)
+
   ## Select all rows for the feature
   keep.row <- project.df$metric %in% feature
   project.df <- project.df[keep.row,]
@@ -1032,15 +1038,17 @@ write.plots.trends <- function(trends, markov.chains, developer.classifications,
                       'num.power.law',
                       'edge.vert.ratio')
 
-
   ## Generate and save box plots for each project
+  loginfo("Saving box plots")
   dlply(trends, .(p.id), function(df) sapply(metrics.box, function(m)
         plot.box(df, m, outdir)))
 
   ## Generate and save series plots
+  loginfo("Saving time series")
   dlply(trends, .(p.id), function(df) plot.series(df, metrics.series, outdir))
 
   ## Gernerate scatter plots
+  loginfo("Saving scatter plots")
   dlply(trends, .(p.id), function(df) plot.scatter(df, "v.degree",
         "cluster.coefficient", outdir))
 
@@ -1051,6 +1059,7 @@ write.plots.trends <- function(trends, markov.chains, developer.classifications,
   dir.create(file.dir, recursive=T)
 
   ## Save markov chain plot
+  loginfo("Saving Markov chains")
   if(!is.null(markov.chains)) {
     chain.types <- names(markov.chains)
     for (type in chain.types) {
@@ -1068,6 +1077,7 @@ write.plots.trends <- function(trends, markov.chains, developer.classifications,
   }
 
   ## Save data to file
+  loginfo("Save data files")
   data <- list(trends=trends,markov.chains=markov.chains,
                developer.classifications= developer.classifications,
                class.edge.probs=class.edge.probs,
diff --git a/codeface/R/semantic_dependency.r b/codeface/R/semantic_dependency.r
index cb44cfdb..651bf756 100644
--- a/codeface/R/semantic_dependency.r
+++ b/codeface/R/semantic_dependency.r
@@ -184,9 +184,11 @@ computeSemanticCoupling <- function(depend.df, threshold=0.5) {
   tdm <- processTermDocMat(corp)
 
   ## Compute document similarity using latent semantic analysis
+  loginfo("Computing document similarity")
   dist.mat <- computeDocSimilarity(tdm)
 
   ## Remove documents that have low similarity
+  loginfo("Remove dissimilar documents")
   edgelist <- cmpfun(getSimDocIds)(dist.mat, threshold)
 
   ## Mapping of document ids to document names
@@ -196,5 +198,7 @@ computeSemanticCoupling <- function(depend.df, threshold=0.5) {
 
   res <- list(edgelist=edgelist, vertex.data=vertex.data)
 
+  loginfo("Finished semantic similarity computation")
+
   return(res)
 }

From b0f5253ebfed5efdfce398c4df0aef47bfd7a69c Mon Sep 17 00:00:00 2001
From: Mitchell Joblin <mitchell.joblin.ext@siemens.com>
Date: Sun, 29 Oct 2017 22:22:54 +0100
Subject: [PATCH 5/5] Increase max packet size for bulk inserts

For some very active projects (e.g. LLVM) the current max packet size
is too small and leads to "Mysql server gone away error 2006"

The max packet size is now 512MB and the warning message
suggests increasing this further if error 2006 persists.

Signed-off-by: Mitchell Joblin <mitchell.joblin.ext@siemens.com>
---
 codeface/dbmanager.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/codeface/dbmanager.py b/codeface/dbmanager.py
index 7d84bf9b..93f1f8d0 100644
--- a/codeface/dbmanager.py
+++ b/codeface/dbmanager.py
@@ -64,7 +64,7 @@ def __init__(self, conf):
             raise
         self.cur = self.con.cursor()
 
-        max_packet_size = 1024 * 1024 * 256
+        max_packet_size = 1024 * 1024 * 512
         self.doExec("SET GLOBAL max_allowed_packet=%s", (max_packet_size,))
 
     def __del__(self):
@@ -85,7 +85,9 @@ def doExec(self, stmt, args=None):
                     if dbe.args[0] == 1213:  # Deadlock! retry...
                         log.warning("Recoverable deadlock in MySQL - retrying.")
                     elif dbe.args[0] == 2006:  # Server gone away...
-                        log.warning("MySQL Server gone away, trying to reconnect.")
+                        log.warning("MySQL Server gone away, trying to "
+                                    "reconnect. If warning persists, try "
+                                    "increasing the max_allowed_packet size.")
                         self.con.ping(True)
                     elif dbe.args[0] == 2013:  # Lost connection to MySQL server during query...
                         log.warning("Lost connection to MySQL server during query, trying to reconnect.")