Skip to content
This repository has been archived by the owner on Aug 24, 2022. It is now read-only.

various small fix/improvements to R scripts for graph analysis #62

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 57 additions & 16 deletions codeface/R/cluster/community_metrics.r
Original file line number Diff line number Diff line change
Expand Up @@ -757,6 +757,7 @@ compute.project.graph.trends <-
revision.data[sapply(revision.data, is.null)] <- NULL

## Create igraph object and select communities which are of a minimum size 4
loginfo("Compute communities")
revision.data <-
mclapply(revision.data, mc.cores=n.cores,
function(rev) {
Expand All @@ -781,6 +782,7 @@ compute.project.graph.trends <-
revision.data[sapply(revision.data, is.null)] <- NULL

## Compute network metrics
loginfo("Compute network metrics")
revision.df.list <-
mclapply(revision.data, mc.cores=n.cores,
function(rev) {
Expand Down Expand Up @@ -848,8 +850,34 @@ plot.influence.ts <- function(project.stats) {
return(plot1)
}

build.filenames <- function(outdir, project.name, type.name, analysis.method) {
plot.organization <- c("project", "plot_type")

filenames <- sapply(plot.organization,
function(p.org) {
if(p.org == "project") {
folder.name <- paste(project.name, "_",
analysis.method, sep="")
plot.name <- paste(type.name, ".png", sep="")
}
else if (p.org == "plot_type") {
folder.name <- paste(type.name, "_",
analysis.method, sep="")
plot.name <- paste(project.name, ".png", sep="")
}
output.path <- file.path(outdir, p.org,
folder.name)
dir.create(output.path, recursive=T)
filename <- file.path(output.path, plot.name)
return(filename)
})

return(filenames)
}

plot.box <- function(project.df, feature, outdir) {
loginfo("Plotting box plot for feature %s", feature)

## Select all rows for the feature
keep.row <- project.df$metric == feature
project.df <- project.df[keep.row,]
Expand All @@ -871,13 +899,17 @@ plot.box <- function(project.df, feature, outdir) {
ylim1[1] <- 0
p1 = p0 + coord_cartesian(ylim = ylim1*1.05)

file.dir <- paste(outdir, "/", project.name, "_", analysis.method, sep="")
dir.create(file.dir, recursive=T)
file.name <- paste(file.dir, "/", feature, ".png",sep="")
ggsave(file.name, p1, height=8, width=20)
file.names <- build.filenames(outdir, project.name, feature,
analysis.method)

sapply(file.names,
function(filename) ggsave(filename, p1, height=8,
width=20))

## Adjusted box plots for skewed data
file.name <- paste(file.dir, "/", feature, "_adjusted.pdf", sep="")
file.names <- build.filenames(outdir, project.name,
paste(feature, "_adjusted.pdf", sep=""),
analysis.method)

pdf(file.name)

Expand All @@ -889,16 +921,18 @@ plot.box <- function(project.df, feature, outdir) {
dev.off()

if(feature %in% c('page.rank','v.degree')) {
file.name <- paste(file.dir, '/', feature, "_distribution.pdf", sep="")
p2 <- ggplot(project.df, aes(x=value)) +
geom_histogram(aes(y=..density..),colour="black", fill="white") +
geom_density(alpha=.2, fill="#FF6666")
ggsave(file.name, p2, height=8, width=20)
sapply(file.names,
function(file.name) ggsave(file.name, p2, height=8, width=20))
}
}
}

plot.series <- function(project.df, feature, outdir) {
loginfo("Plot time series for feature %s", feature)

## Select all rows for the feature
keep.row <- project.df$metric %in% feature
project.df <- project.df[keep.row,]
Expand All @@ -925,10 +959,10 @@ plot.series <- function(project.df, feature, outdir) {
strip.text.x = element_text(size=15))
}

file.dir <- paste(outdir, "/", project.name, "_", analysis.method, sep="")
dir.create(file.dir, recursive=T)
file.name <- paste(file.dir, "/time_series_metrics.png",sep="")
ggsave(file.name, p, height=41, width=20)
file.names <- build.filenames(outdir, project.name, "time_series",
analysis.method)
sapply(file.names,
function(file.name) ggsave(file.name, p, height=41, width=20))
}


Expand Down Expand Up @@ -956,10 +990,12 @@ plot.scatter <- function(project.df, feature1, feature2, outdir) {
facet_wrap( ~ cycle) +
geom_smooth(method="lm")

file.dir <- paste(outdir, "/", project.name, "_", analysis.method, sep="")
dir.create(file.dir, recursive=T)
file.name <- paste(file.dir, "/", feature1, "_vs_", feature2, ".png",sep="")
ggsave(file.name, p, height=40, width=40)
feature <- paste(feature1, "_vs_", feature2, sep="")
file.names <- build.filenames(outdir, project.name, feature,
analysis.method)

sapply(file.names,
function(file.name) ggsave(file.name, p, height=40, width=40))
}
}

Expand Down Expand Up @@ -1002,24 +1038,28 @@ write.plots.trends <- function(trends, markov.chains, developer.classifications,
'num.power.law',
'edge.vert.ratio')


## Generate and save box plots for each project
loginfo("Saving box plots")
dlply(trends, .(p.id), function(df) sapply(metrics.box, function(m)
plot.box(df, m, outdir)))

## Generate and save series plots
loginfo("Saving time series")
dlply(trends, .(p.id), function(df) plot.series(df, metrics.series, outdir))

## Gernerate scatter plots
loginfo("Saving scatter plots")
dlply(trends, .(p.id), function(df) plot.scatter(df, "v.degree",
"cluster.coefficient", outdir))

project.name <- unique(trends$name)
analysis.method <- unique(trends$analysis.method)

file.dir <- paste(outdir, "/", project.name, "_", analysis.method, sep="")
dir.create(file.dir, recursive=T)

## Save markov chain plot
loginfo("Saving Markov chains")
if(!is.null(markov.chains)) {
chain.types <- names(markov.chains)
for (type in chain.types) {
Expand All @@ -1037,6 +1077,7 @@ write.plots.trends <- function(trends, markov.chains, developer.classifications,
}

## Save data to file
loginfo("Save data files")
data <- list(trends=trends,markov.chains=markov.chains,
developer.classifications= developer.classifications,
class.edge.probs=class.edge.probs,
Expand Down
13 changes: 0 additions & 13 deletions codeface/R/cluster/persons.r
Original file line number Diff line number Diff line change
Expand Up @@ -1356,16 +1356,3 @@ test.community.quality.modularity <- function() {
quality <- community.metric(g, g.clust, "modularization")

}

#########################################################################
## Executed Statements
#########################################################################
##----------------------------
## Parse commandline arguments
##----------------------------

config.script.run({
conf <- config.from.args(positional.args=list("resdir", "range.id"),
require.project=TRUE)
performAnalysis(conf$resdir, conf)
})
31 changes: 31 additions & 0 deletions codeface/R/cluster/run_analysis.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#! /usr/bin/env Rscript
## Analyse the developer connections

## This file is part of Codeface. Codeface is free software: you can
## redistribute it and/or modify it under the terms of the GNU General Public
## License as published by the Free Software Foundation, version 2.
##
## This program is distributed in the hope that it will be useful, but WITHOUT
## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
## FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
## details.
##
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software
## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
##
## Copyright 2010, 2011 by Wolfgang Mauerer <wm@linux-kernel.net>
## Copyright 2012, 2013, Siemens AG, Wolfgang Mauerer <wolfgang.mauerer@siemens.com>
## All Rights Reserved.

source("persons.r")

##----------------------------
## Parse commandline arguments
##----------------------------

config.script.run({
conf <- config.from.args(positional.args=list("resdir", "range.id"),
require.project=TRUE)
performAnalysis(conf$resdir, conf)
})
2 changes: 2 additions & 0 deletions codeface/R/network_stream.r
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ build.dev.net.stream <- function(con, project.id, type, dates.df,


construct.edgelist <- function(commit.list, add.co.change.rel, add.semantic.rel) {
if(length(commit.list$commit.df) == 0) return(list())

## Compute relation for developer contribution to common entity
entity.groups <- aggregate.on.common.entity(commit.list$commit.df)

Expand Down
4 changes: 4 additions & 0 deletions codeface/R/semantic_dependency.r
Original file line number Diff line number Diff line change
Expand Up @@ -184,9 +184,11 @@ computeSemanticCoupling <- function(depend.df, threshold=0.5) {
tdm <- processTermDocMat(corp)

## Compute document similarity using latent semantic analysis
loginfo("Computing document similarity")
dist.mat <- computeDocSimilarity(tdm)

## Remove documents that have low similarity
loginfo("Remove dissimilar documents")
edgelist <- cmpfun(getSimDocIds)(dist.mat, threshold)

## Mapping of document ids to document names
Expand All @@ -196,5 +198,7 @@ computeSemanticCoupling <- function(depend.df, threshold=0.5) {

res <- list(edgelist=edgelist, vertex.data=vertex.data)

loginfo("Finished semantic similarity computation")

return(res)
}
6 changes: 4 additions & 2 deletions codeface/dbmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def __init__(self, conf):
raise
self.cur = self.con.cursor()

max_packet_size = 1024 * 1024 * 256
max_packet_size = 1024 * 1024 * 512
self.doExec("SET GLOBAL max_allowed_packet=%s", (max_packet_size,))

def __del__(self):
Expand All @@ -85,7 +85,9 @@ def doExec(self, stmt, args=None):
if dbe.args[0] == 1213: # Deadlock! retry...
log.warning("Recoverable deadlock in MySQL - retrying.")
elif dbe.args[0] == 2006: # Server gone away...
log.warning("MySQL Server gone away, trying to reconnect.")
log.warning("MySQL Server gone away, trying to "
"reconnect. If warning persists, try "
"increasing the max_allowed_packet size.")
self.con.ping(True)
elif dbe.args[0] == 2013: # Lost connection to MySQL server during query...
log.warning("Lost connection to MySQL server during query, trying to reconnect.")
Expand Down
2 changes: 1 addition & 1 deletion codeface/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def project_analyse(resdir, gitdir, codeface_conf, project_conf,

#########
# STAGE 2: Cluster analysis
exe = abspath(resource_filename(__name__, "R/cluster/persons.r"))
exe = abspath(resource_filename(__name__, "R/cluster/run_analysis.r"))
cwd, _ = pathsplit(exe)
cmd = []
cmd.append(exe)
Expand Down