From 337afd60751285ac1325e0967fc5fb0a025ae7a8 Mon Sep 17 00:00:00 2001 From: Mitchell Joblin Date: Sat, 21 Oct 2017 21:03:00 +0200 Subject: [PATCH 1/5] Catch edge case when input data frame is length 0 Signed-off-by: Mitchell Joblin --- codeface/R/network_stream.r | 2 ++ 1 file changed, 2 insertions(+) diff --git a/codeface/R/network_stream.r b/codeface/R/network_stream.r index 5fad97b7..5b8e3d29 100755 --- a/codeface/R/network_stream.r +++ b/codeface/R/network_stream.r @@ -112,6 +112,8 @@ build.dev.net.stream <- function(con, project.id, type, dates.df, construct.edgelist <- function(commit.list, add.co.change.rel, add.semantic.rel) { + if(length(commit.list$commit.df) == 0) return(list()) + ## Compute relation for developer contribution to common entity entity.groups <- aggregate.on.common.entity(commit.list$commit.df) From 1feff0f6ab55f0459744c070cf644c38f64d765a Mon Sep 17 00:00:00 2001 From: Mitchell Joblin Date: Sat, 21 Oct 2017 21:05:47 +0200 Subject: [PATCH 2/5] Move entry point for R script execution to another file When persons.r expects to be passed command line parameters, it makes it impossible to be sourced by other scripts. 
Signed-off-by: Mitchell Joblin --- codeface/R/cluster/persons.r | 13 ------------- codeface/R/cluster/run_analysis.r | 31 +++++++++++++++++++++++++++++++ codeface/project.py | 2 +- 3 files changed, 32 insertions(+), 14 deletions(-) create mode 100755 codeface/R/cluster/run_analysis.r diff --git a/codeface/R/cluster/persons.r b/codeface/R/cluster/persons.r index 44b33fdb..fbfffdca 100755 --- a/codeface/R/cluster/persons.r +++ b/codeface/R/cluster/persons.r @@ -1356,16 +1356,3 @@ test.community.quality.modularity <- function() { quality <- community.metric(g, g.clust, "modularization") } - -######################################################################### -## Executed Statements -######################################################################### -##---------------------------- -## Parse commandline arguments -##---------------------------- - -config.script.run({ - conf <- config.from.args(positional.args=list("resdir", "range.id"), - require.project=TRUE) - performAnalysis(conf$resdir, conf) -}) diff --git a/codeface/R/cluster/run_analysis.r b/codeface/R/cluster/run_analysis.r new file mode 100755 index 00000000..2fce2143 --- /dev/null +++ b/codeface/R/cluster/run_analysis.r @@ -0,0 +1,31 @@ +#! /usr/bin/env Rscript +## Analyse the developer connections + +## This file is part of Codeface. Codeface is free software: you can +## redistribute it and/or modify it under the terms of the GNU General Public +## License as published by the Free Software Foundation, version 2. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +## FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. 
+## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software +## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +## +## Copyright 2010, 2011 by Wolfgang Mauerer +## Copyright 2012, 2013, Siemens AG, Wolfgang Mauerer +## All Rights Reserved. + +source("persons.r") + +##---------------------------- +## Parse commandline arguments +##---------------------------- + +config.script.run({ + conf <- config.from.args(positional.args=list("resdir", "range.id"), + require.project=TRUE) + performAnalysis(conf$resdir, conf) +}) diff --git a/codeface/project.py b/codeface/project.py index abb0330b..e37114bb 100644 --- a/codeface/project.py +++ b/codeface/project.py @@ -114,7 +114,7 @@ def project_analyse(resdir, gitdir, codeface_conf, project_conf, ######### # STAGE 2: Cluster analysis - exe = abspath(resource_filename(__name__, "R/cluster/persons.r")) + exe = abspath(resource_filename(__name__, "R/cluster/run_analysis.r")) cwd, _ = pathsplit(exe) cmd = [] cmd.append(exe) From b515a5072e8507c1d7c3697cc6bdb6c8183b2b76 Mon Sep 17 00:00:00 2001 From: Mitchell Joblin Date: Sat, 21 Oct 2017 21:14:44 +0200 Subject: [PATCH 3/5] Change how plots are saved Save plots two different ways, according to project and according to plot type. This just helps to more quickly find relevant information Signed-off-by: Mitchell Joblin --- codeface/R/cluster/community_metrics.r | 61 +++++++++++++++++++------- 1 file changed, 46 insertions(+), 15 deletions(-) diff --git a/codeface/R/cluster/community_metrics.r b/codeface/R/cluster/community_metrics.r index 4c76cfc2..01c54291 100644 --- a/codeface/R/cluster/community_metrics.r +++ b/codeface/R/cluster/community_metrics.r @@ -848,6 +848,30 @@ plot.influence.ts <- function(project.stats) { return(plot1) } +build.filenames <- function(outdir, project.name, type.name, analysis.method) { + plot.organization <- c("project", "plot_type") + + filenames <- 
sapply(plot.organization, + function(p.org) { + if(p.org == "project") { + folder.name <- paste(project.name, "_", + analysis.method, sep="") + plot.name <- paste(type.name, ".png", sep="") + } + else if (p.org == "plot_type") { + folder.name <- paste(type.name, "_", + analysis.method, sep="") + plot.name <- paste(project.name, ".png", sep="") + } + output.path <- file.path(outdir, p.org, + folder.name) + dir.create(output.path, recursive=T) + filename <- file.path(output.path, plot.name) + return(filename) + }) + + return(filenames) +} plot.box <- function(project.df, feature, outdir) { ## Select all rows for the feature @@ -871,13 +895,17 @@ plot.box <- function(project.df, feature, outdir) { ylim1[1] <- 0 p1 = p0 + coord_cartesian(ylim = ylim1*1.05) - file.dir <- paste(outdir, "/", project.name, "_", analysis.method, sep="") - dir.create(file.dir, recursive=T) - file.name <- paste(file.dir, "/", feature, ".png",sep="") - ggsave(file.name, p1, height=8, width=20) + file.names <- build.filenames(outdir, project.name, feature, + analysis.method) + + sapply(file.names, + function(filename) ggsave(filename, p1, height=8, + width=20)) ## Adjusted box plots for skewed data - file.name <- paste(file.dir, "/", feature, "_adjusted.pdf", sep="") + file.names <- build.filenames(outdir, project.name, + paste(feature, "_adjusted.pdf", sep=""), + analysis.method) pdf(file.name) @@ -889,11 +917,11 @@ plot.box <- function(project.df, feature, outdir) { dev.off() if(feature %in% c('page.rank','v.degree')) { - file.name <- paste(file.dir, '/', feature, "_distribution.pdf", sep="") p2 <- ggplot(project.df, aes(x=value)) + geom_histogram(aes(y=..density..),colour="black", fill="white") + geom_density(alpha=.2, fill="#FF6666") - ggsave(file.name, p2, height=8, width=20) + sapply(file.names, + function(file.name) ggsave(file.name, p2, height=8, width=20)) } } } @@ -925,10 +953,10 @@ plot.series <- function(project.df, feature, outdir) { strip.text.x = element_text(size=15)) } - 
file.dir <- paste(outdir, "/", project.name, "_", analysis.method, sep="") - dir.create(file.dir, recursive=T) - file.name <- paste(file.dir, "/time_series_metrics.png",sep="") - ggsave(file.name, p, height=41, width=20) + file.names <- build.filenames(outdir, project.name, "time_series", + analysis.method) + sapply(file.names, + function(file.name) ggsave(file.name, p, height=41, width=20)) } @@ -956,10 +984,12 @@ plot.scatter <- function(project.df, feature1, feature2, outdir) { facet_wrap( ~ cycle) + geom_smooth(method="lm") - file.dir <- paste(outdir, "/", project.name, "_", analysis.method, sep="") - dir.create(file.dir, recursive=T) - file.name <- paste(file.dir, "/", feature1, "_vs_", feature2, ".png",sep="") - ggsave(file.name, p, height=40, width=40) + feature <- paste(feature1, "_vs_", feature2, sep="") + file.names <- build.filenames(outdir, project.name, feature, + analysis.method) + + sapply(file.names, + function(file.name) ggsave(file.name, p, height=40, width=40)) } } @@ -1018,6 +1048,7 @@ write.plots.trends <- function(trends, markov.chains, developer.classifications, analysis.method <- unique(trends$analysis.method) file.dir <- paste(outdir, "/", project.name, "_", analysis.method, sep="") + dir.create(file.dir, recursive=T) ## Save markov chain plot if(!is.null(markov.chains)) { From fd5b819f1d5ee409f990e02703ca60c89657628d Mon Sep 17 00:00:00 2001 From: Mitchell Joblin Date: Sat, 21 Oct 2017 21:04:14 +0200 Subject: [PATCH 4/5] Add logging statements Signed-off-by: Mitchell Joblin --- codeface/R/cluster/community_metrics.r | 12 +++++++++++- codeface/R/semantic_dependency.r | 4 ++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/codeface/R/cluster/community_metrics.r b/codeface/R/cluster/community_metrics.r index 01c54291..8d06bff4 100644 --- a/codeface/R/cluster/community_metrics.r +++ b/codeface/R/cluster/community_metrics.r @@ -757,6 +757,7 @@ compute.project.graph.trends <- revision.data[sapply(revision.data, is.null)] <- NULL 
## Create igraph object and select communities which are of a minimum size 4 + loginfo("Compute communities") revision.data <- mclapply(revision.data, mc.cores=n.cores, function(rev) { @@ -781,6 +782,7 @@ compute.project.graph.trends <- revision.data[sapply(revision.data, is.null)] <- NULL ## Compute network metrics + loginfo("Compute network metrics") revision.df.list <- mclapply(revision.data, mc.cores=n.cores, function(rev) { @@ -874,6 +876,8 @@ build.filenames <- function(outdir, project.name, type.name, analysis.method) { } plot.box <- function(project.df, feature, outdir) { + loginfo("Plotting box plot for feature %s", feature) + ## Select all rows for the feature keep.row <- project.df$metric == feature project.df <- project.df[keep.row,] @@ -927,6 +931,8 @@ plot.box <- function(project.df, feature, outdir) { } plot.series <- function(project.df, feature, outdir) { + loginfo("Plot time series for feature %s", feature) + ## Select all rows for the feature keep.row <- project.df$metric %in% feature project.df <- project.df[keep.row,] @@ -1032,15 +1038,17 @@ write.plots.trends <- function(trends, markov.chains, developer.classifications, 'num.power.law', 'edge.vert.ratio') - ## Generate and save box plots for each project + loginfo("Saving box plots") dlply(trends, .(p.id), function(df) sapply(metrics.box, function(m) plot.box(df, m, outdir))) ## Generate and save series plots + loginfo("Saving time series") dlply(trends, .(p.id), function(df) plot.series(df, metrics.series, outdir)) ## Gernerate scatter plots + loginfo("Saving scatter plots") dlply(trends, .(p.id), function(df) plot.scatter(df, "v.degree", "cluster.coefficient", outdir)) @@ -1051,6 +1059,7 @@ write.plots.trends <- function(trends, markov.chains, developer.classifications, dir.create(file.dir, recursive=T) ## Save markov chain plot + loginfo("Saving Markov chains") if(!is.null(markov.chains)) { chain.types <- names(markov.chains) for (type in chain.types) { @@ -1068,6 +1077,7 @@ 
write.plots.trends <- function(trends, markov.chains, developer.classifications, } ## Save data to file + loginfo("Save data files") data <- list(trends=trends,markov.chains=markov.chains, developer.classifications= developer.classifications, class.edge.probs=class.edge.probs, diff --git a/codeface/R/semantic_dependency.r b/codeface/R/semantic_dependency.r index cb44cfdb..651bf756 100644 --- a/codeface/R/semantic_dependency.r +++ b/codeface/R/semantic_dependency.r @@ -184,9 +184,11 @@ computeSemanticCoupling <- function(depend.df, threshold=0.5) { tdm <- processTermDocMat(corp) ## Compute document similarity using latent semantic analysis + loginfo("Computing document similarity") dist.mat <- computeDocSimilarity(tdm) ## Remove documents that have low similarity + loginfo("Remove dissimilar documents") edgelist <- cmpfun(getSimDocIds)(dist.mat, threshold) ## Mapping of document ids to document names @@ -196,5 +198,7 @@ computeSemanticCoupling <- function(depend.df, threshold=0.5) { res <- list(edgelist=edgelist, vertex.data=vertex.data) + loginfo("Finished semantic similarity computation") + return(res) } From b0f5253ebfed5efdfce398c4df0aef47bfd7a69c Mon Sep 17 00:00:00 2001 From: Mitchell Joblin Date: Sun, 29 Oct 2017 22:22:54 +0100 Subject: [PATCH 5/5] Increase max packet size for bulk inserts For some very active projects (e.g. LLVM) the current max packet size is too small and leads to "Mysql server gone away error 2006". The max packet size is now 512MB and the warning message suggests increasing this further if error 2006 persists. 
Signed-off-by: Mitchell Joblin --- codeface/dbmanager.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/codeface/dbmanager.py b/codeface/dbmanager.py index 7d84bf9b..93f1f8d0 100644 --- a/codeface/dbmanager.py +++ b/codeface/dbmanager.py @@ -64,7 +64,7 @@ def __init__(self, conf): raise self.cur = self.con.cursor() - max_packet_size = 1024 * 1024 * 256 + max_packet_size = 1024 * 1024 * 512 self.doExec("SET GLOBAL max_allowed_packet=%s", (max_packet_size,)) def __del__(self): @@ -85,7 +85,9 @@ def doExec(self, stmt, args=None): if dbe.args[0] == 1213: # Deadlock! retry... log.warning("Recoverable deadlock in MySQL - retrying.") elif dbe.args[0] == 2006: # Server gone away... - log.warning("MySQL Server gone away, trying to reconnect.") + log.warning("MySQL Server gone away, trying to " + "reconnect. If warning persists, try " + "increasing the max_allowed_packet size.") self.con.ping(True) elif dbe.args[0] == 2013: # Lost connection to MySQL server during query... log.warning("Lost connection to MySQL server during query, trying to reconnect.")