-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgitlapse.py
More file actions
375 lines (267 loc) · 13.3 KB
/
gitlapse.py
File metadata and controls
375 lines (267 loc) · 13.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
import os
import inspect
from subprocess import *
import tempfile
import sys
from optparse import OptionParser
from xml.dom.minidom import parseString
import re
from decimal import *
class Executor:
    # Thin wrapper around subprocess for running shell commands.
    # Python 2 syntax (print statement, `except E, e`) — this file targets Python 2.
    def execute(self, command):
        # Run `command` through the shell and return its stdout file object.
        # On a signal-terminated child or an OS-level launch failure the whole
        # process exits — callers never see a partial result.
        try:
            print "Running " + command
            # shell=True is required: callers pass strings containing shell
            # redirection (e.g. '> file' in GitRepo.list_commits_to_file).
            p = Popen(command, shell=True, stdout=PIPE)
            # Wait for the child explicitly; index [1] is the exit status word.
            retcode = os.waitpid(p.pid, 0)[1]
            if retcode < 0:
                # Negative status: the child was killed by a signal.
                print >>sys.stderr, "Child was terminated by signal", -retcode
                sys.exit(retcode)
            else:
                # NOTE(review): a non-zero positive exit status is NOT treated
                # as failure here — only signal deaths abort. Confirm intended.
                return p.stdout
        except OSError, e:
            # Raised when the shell/program could not be started at all.
            print >>sys.stderr, "Execution failed:", e
            sys.exit(2)
class GitRepo:
    """Shells out to git (via an Executor) for history listing and checkouts."""
    def __init__(self, git_dir, working_dir, executor):
        self.git_dir = git_dir
        self.working_dir = working_dir
        self.executor = executor
    def current_head(self):
        """Return the full hash of the commit currently at HEAD."""
        command = 'git --git-dir={0} log --format=format:"%H" -1'.format(self.git_dir)
        return self.executor.execute(command).read()
    def list_commits_to_file(self, destination_file_name):
        """Dump '<hash> || <iso date> || <subject>' lines to a file and reopen it."""
        command = ('git --git-dir={0} --no-pager log --format=format:"%H || %ai || %s%n"'
                   ' --date=iso > {1}').format(self.git_dir, destination_file_name)
        self.executor.execute(command)
        return open(destination_file_name)
    def commits(self, destination_file_name):
        """Return a list of (commit_hash, date) tuples for the whole history."""
        log_lines = self.list_commits_to_file(destination_file_name)
        # Lines without the ' || ' separator (blank spacer lines) are skipped.
        return [(fields[0], fields[1])
                for fields in (line.split('||') for line in log_lines)
                if len(fields) > 1]
    def hard_reset(self, commit_hash):
        """Force the working tree to the state of `commit_hash`."""
        command = 'git --git-dir={0} --work-tree={1} reset --hard {2}'.format(
            self.git_dir, self.working_dir, commit_hash)
        self.executor.execute(command)
class CheckstyleParser:
    """Turns a checkstyle XML report into a ToxicityReport."""
    def parse(self, checkstyle_report_content):
        """Split the report's <file> elements into healthy and toxic classes."""
        document = parseString(checkstyle_report_content)
        checkstyle_root = document.getElementsByTagName('checkstyle')[0]
        healthy = []
        toxic = []
        for file_node in checkstyle_root.getElementsByTagName('file'):
            error_nodes = file_node.getElementsByTagName('error')
            if error_nodes:
                # One entry per checkstyle rule; a later error for the same
                # rule overwrites the earlier message, as in a plain dict fill.
                messages = dict((err.getAttribute('source'), err.getAttribute('message'))
                                for err in error_nodes)
                toxic.append(ToxicClass(file_node.getAttribute('name'), messages))
            else:
                healthy.append(file_node.getAttribute('name'))
        return ToxicityReport(healthy, toxic)
class CheckstyleExecution:
    """Runs the bundled checkstyle jar over a source tree, returning the XML report."""
    def __init__(self, executor, path_to_install):
        self.executor = executor
        self.path_to_install = path_to_install
    def analyse(self, src_directory):
        """Return checkstyle's XML output for `src_directory` as a string."""
        jar = '%s/tools/checkstyle/checkstyle-all-4.4.jar' % self.path_to_install
        rules = '%s/tools/checkstyle/metrics.xml' % self.path_to_install
        command = 'java -jar %s -c %s -r %s -f xml' % (jar, rules, src_directory)
        return self.executor.execute(command).read()
class ToxicClass:
    """A class flagged by checkstyle, with its rule-source -> message errors."""
    def __init__(self, class_name, errors):
        # Bug fix: `class_name` was previously accepted but never stored, so a
        # ToxicClass could not report WHICH class was toxic. Storing it is
        # backward-compatible (pure addition of an attribute).
        self.class_name = class_name
        # Maps checkstyle check source (rule id) -> violation message.
        self.errors = errors
class ToxicityReport:
    """Summary of a checkstyle run: names of clean classes and the toxic ones."""
    def __init__(self, healthy_class_names, unhealthy_class_names):
        self.healthy_class_names = healthy_class_names
        self.unhealthy_class_names = unhealthy_class_names
    def number_of_healthy_classes(self):
        """Count of classes with no checkstyle errors."""
        return len(self.healthy_class_names)
    # Backward-compatible alias preserving the original misspelled method name.
    number_of_healty_classes = number_of_healthy_classes
    def number_of_unhealthy_classes(self):
        """Count of classes with at least one checkstyle error."""
        return len(self.unhealthy_class_names)
class ToxicityCalculator():
    """Scores checkstyle violations: each violation costs actual/allowed.

    Only the three checks registered in `handlers` are scorable; an unknown
    check source raises KeyError from `toxicity`.
    """
    def __init__(self):
        # Dispatch table: checkstyle rule id -> message-parsing cost function.
        self.handlers = {
            'com.puppycrawl.tools.checkstyle.checks.sizes.MethodLengthCheck' : self.calculate_long_method_length_cost,
            'com.puppycrawl.tools.checkstyle.checks.sizes.FileLengthCheck' : self.calculate_long_class_cost,
            'com.puppycrawl.tools.checkstyle.checks.metrics.ClassDataAbstractionCouplingCheck' : self.calculate_abstraction_coupling_cost}
    def calculate_abstraction_coupling_cost(self, message_string):
        """Cost of a ClassDataAbstractionCoupling violation message."""
        actual, allowed = self.matches('Class Data Abstraction Coupling is (\d*) \(max allowed is (\d*)\)', message_string)
        return self.cost(actual, allowed)
    def calculate_long_method_length_cost(self, message_string):
        """Cost of a MethodLength violation message."""
        actual, allowed = self.matches('Method length is (\d*) lines \(max allowed is (\d*)\).', message_string)
        return self.cost(actual, allowed)
    def calculate_long_class_cost(self, message_string):
        """Cost of a FileLength violation message."""
        actual, allowed = self.matches('File length is (\d*) lines \(max allowed is (\d*)\)', message_string)
        return self.cost(actual, allowed)
    def matches(self, pattern, string):
        """Return the capture groups of `pattern` found in `string`.

        Raises AttributeError (None.groups) when the pattern does not match.
        """
        found = re.search(pattern, string)
        return found.groups()
    def toxicity(self, errors):
        """Sum the cost of every error (rule id -> message), rounded down to 2 dp."""
        total = sum((self.handlers[source](message) for source, message in errors.items()),
                    Decimal(0))
        return self.round_down(total)
    def cost(self, actual, allowed):
        """Ratio of the actual value to the allowed maximum, as a Decimal."""
        return Decimal(actual) / Decimal(allowed)
    def round_down(self, decimal):
        """Truncate (never round up) to two decimal places."""
        return decimal.quantize(Decimal('.01'), rounding=ROUND_DOWN)
class SkippingAnalyser:
    """Samples history: forwards only every Nth commit to the real analyser."""
    def __init__(self, skipping_commits, delegate_analyser, git_repo):
        self.skipping_commits = skipping_commits
        self.delegate_analyser = delegate_analyser
        self.git_repo = git_repo
        self.current_count = 0
    def analyse(self, commit_hash, commit_date):
        """Count the call; on the Nth, check out the commit and delegate."""
        self.current_count += 1
        if self.current_count != self.skipping_commits:
            return  # not yet at the sampling interval
        self.git_repo.hard_reset(commit_hash)
        self.delegate_analyser.analyse(commit_hash, commit_date)
        self.current_count = 0
class ClocParser:
    """Parses cloc's CSV output into a per-commit metrics accumulator."""
    def create_record(self, src_dir, by_date_count, cloc_line):
        """Record one CSV data row's language/LOC count; returns the accumulator."""
        fields = cloc_line.split(',')
        if len(fields) < 7:
            raise Exception('Cannot parse line "' + cloc_line + '"')
        # cloc CSV layout: files, language, blank, comment, code, scale, 3rd-gen equiv.
        language = fields[1]
        lines_of_code = fields[4]
        by_date_count.add_record(src_dir, language, lines_of_code)
        return by_date_count
    def parse(self, commit_date, commit_hash, src_directory_name, cloc_output):
        """Fold every data row of `cloc_output` into one MetricsForCommit."""
        metrics = MetricsForCommit(commit_date, commit_hash)
        for row in cloc_output.split('\n'):
            # Skip the header row (contains 'files') and blank/whitespace rows.
            if 'files' in row:
                continue
            if not row or row.isspace():
                continue
            metrics = self.create_record(src_directory_name, metrics, row)
        return metrics
class TsvFormattingStore:
    """Accumulates per-commit metrics and renders them as a tab-separated table.

    Note: despite the name `as_csv`, the output is tab-delimited (TSV), matching
    the .tsv file it is written to by line_counts().
    """
    def __init__(self):
        # Maps commit hash -> MetricsForCommit; duplicates are merged in store().
        self.records_by_commit = {}
    def store(self, metrics_for_commit):
        """Add a record, merging it into any existing record for the same commit."""
        commit = metrics_for_commit.commit
        # Fix: dict.has_key() is Python-2-only (removed in Python 3); the `in`
        # operator is behaviourally identical on both.
        if commit in self.records_by_commit:
            old_record = self.records_by_commit[commit]
            old_record.merge(metrics_for_commit)
        else:
            self.records_by_commit[commit] = metrics_for_commit
    def metrics_to_report(self):
        """Return {src_dir: set of metric names} seen across all stored commits."""
        metrics_to_report = {}
        for record in self.records_by_commit.values():
            for src_dir in record.src_dirs.keys():
                metrics_for_dir = metrics_to_report.get(src_dir, set())
                for metric in record.src_dirs[src_dir].keys():
                    metrics_for_dir.add(metric)
                metrics_to_report[src_dir] = metrics_for_dir
        return metrics_to_report
    def create_row_header(self, metrics_to_report):
        """Build the header row: 'Date' plus one '<src_dir>-<metric>' column each."""
        row_header = 'Date'
        for src_dir in metrics_to_report.keys():
            for language in metrics_to_report[src_dir]:
                row_header = row_header + '\t' + src_dir + '-' + language
        row_header = row_header + '\n'
        return row_header
    def as_csv(self):
        """Render all stored records as one TSV string; missing counts become 0."""
        metrics_to_report = self.metrics_to_report()
        row_header = self.create_row_header(metrics_to_report)
        for record in self.records_by_commit.values():
            row_header = row_header + record.date
            for src_dir in metrics_to_report.keys():
                for metric in metrics_to_report[src_dir]:
                    row_header = row_header + '\t' + str(record.src_dirs.get(src_dir, {}).get(metric, 0))
            row_header = row_header + '\n'
        return row_header
class LinesOfCodeAnalyser:
    """Counts lines of code at a commit (via cloc) and stores the result."""
    def __init__(self, abs_src_directory, running_from, data_store, parser = ClocParser(), executor = Executor()):
        # `running_from` is the tool's install directory (location of tools/cloc-1.08.pl).
        self.executor = executor
        self.parser = parser
        self.running_from = running_from
        self.abs_src_directory = abs_src_directory
        self.data_store = data_store
    def analyse(self, commit_hash, commit_date):
        """Run cloc over the source directory and store the parsed counts."""
        command = 'perl %s/tools/cloc-1.08.pl %s --csv --exclude-lang=CSS,HTML,XML --quiet' % (
            self.running_from, self.abs_src_directory)
        cloc_csv = self.executor.execute(command).read()
        parsed = self.parser.parse(commit_date, commit_hash, self.abs_src_directory, cloc_csv)
        self.data_store.store(parsed)
class CompositeAnalyser:
    """Fans one analyse() call out to every delegate analyser, in order."""
    def __init__(self, delegates):
        self.delegates = delegates
    def analyse(self, commit_hash, commit_date):
        """Invoke each delegate's analyse() with the same commit and date."""
        for analyser in self.delegates:
            analyser.analyse(commit_hash, commit_date)
class MetricsForCommit:
    """Metric counts for one commit, keyed by source dir then metric name."""
    def __init__(self, date, commit):
        self.date = date
        self.commit = commit
        # {src_dir: {metric_name: int count}}
        self.src_dirs = {}
    def add_record(self, src_dir, metric, count):
        """Store `count` (coerced to int) under src_dir/metric."""
        self.src_dirs.setdefault(src_dir, {})[metric] = int(count)
    def merge(self, other_by_date_count):
        """Absorb another record for the SAME commit; its src dirs overwrite ours."""
        if other_by_date_count.commit != self.commit:
            raise Exception('Can only merge records with same commit')
        for src_dir, counts in other_by_date_count.src_dirs.items():
            self.src_dirs[src_dir] = counts
def generate_commit_list(location_for_files, git_repo):
    """Dump the repo's history to <location>/commits.out and return (hash, date) pairs."""
    return git_repo.commits(location_for_files + "/commits.out")
def line_counts(location_for_results, sample_rate, src_dirs, git_dir, working_dir):
    # Walks the repo's history, sampling every `sample_rate`-th commit, runs cloc
    # over each src dir at that commit, and writes a TSV of line counts over time.
    data = open(location_for_results + "/line_count_by_time.tsv", 'w')
    git_repo = GitRepo(git_dir, working_dir, Executor())
    commit_list = generate_commit_list(location_for_results, git_repo)
    # Remember the current HEAD so the working tree can be restored at the end.
    head = git_repo.current_head()
    store = TsvFormattingStore()
    # One LOC analyser per source directory, all feeding the same store.
    delegate = CompositeAnalyser([LinesOfCodeAnalyser(src_dir, RUNNING_FROM, store) for src_dir in src_dirs])
    skipping_analyser = SkippingAnalyser(skipping_commits = sample_rate, delegate_analyser = delegate, git_repo = git_repo)
    for commit in commit_list:
        # commit is a (hash, date) tuple from GitRepo.commits.
        date = commit[1]
        git_commit = commit[0]
        skipping_analyser.analyse(git_commit, date)
    data.write(store.as_csv())
    print "Resetting to " + head
    # Restore the working tree to where the user left it.
    git_repo.hard_reset(head)
    print data.name
    data.close()
def to_gnuplot(data_table):
    """Build a gnuplot 'plot' command from the TSV header row of `data_table`."""
    series_names = data_table.split('\n')[0].split('\t')[1:]
    # Data series start at gnuplot column 4: the ISO date field occupies
    # whitespace-separated columns 1-3 (date, time, timezone).
    clauses = ['"line_count_by_time.tsv" using 1:%d title "%s", ' % (column, name)
               for column, name in enumerate(series_names, start=4)]
    return 'plot ' + ''.join(clauses)
def execution_path(filename):
    # Resolve the directory this tool is installed in, by locating the source
    # file of the CALLER (sys._getframe(1)) and trimming a trailing 'run.sh'.
    # NOTE(review): the `filename` parameter is unused — 'run.sh' is hard-coded
    # below; presumably they were meant to be the same value. Confirm before
    # relying on passing anything else.
    execution_path = os.path.join(os.path.dirname(inspect.getfile(sys._getframe(1))), 'run.sh')
    path_to_run = os.path.abspath(execution_path)
    if path_to_run.endswith('run.sh'):
        # Drop the 6-character 'run.sh' suffix, keeping the trailing slash.
        index_of_run = len(path_to_run) - 6
        path_to_run = path_to_run[:index_of_run]
    print "Using " + path_to_run
    return path_to_run
# Install directory of the tool; used to locate bundled tools (cloc, checkstyle).
RUNNING_FROM = execution_path('run.sh')
def pwd():
    """Return the current working directory, as reported by the `pwd` shell command."""
    output = Executor().execute('pwd')
    return output.read().strip()
def main(argv=None):
    # Command-line entry point: parse options, then run the line-count analysis.
    if argv is None:
        argv = sys.argv
    parser = OptionParser()
    parser.add_option("-r", "--results_dir", action="store", dest="result_dir", type="string", default=".", help="Location where results will be stored")
    parser.add_option("-s", "--source_dir", action="store", dest="src_dirs", type="string", default="src", help="A comma seperated list of directories to parse")
    parser.add_option("-f", "--frequency_of_sample", action="store", dest="sample_rate", default=100, type="int", help="How often should a sample be made")
    parser.add_option("-g", "--git_repo_dir", action="store", dest="git_repo_dir", default=pwd()+'/.git', type="string", help="The directory containing the .git file")
    parser.add_option("-w", "--working_dir", action="store", dest="working_dir", default=pwd(), type="string", help="Where will files be checked out to for line counts etc")
    (options, args) = parser.parse_args(argv)
    results_dir = options.result_dir
    sample_rate = options.sample_rate
    # src_dirs is a comma-separated string; split into a list before use.
    src_dirs_str = options.src_dirs
    git_dir = options.git_repo_dir
    working_dir = options.working_dir
    print "Using a sample rate of " + str(sample_rate) + " reading from files " + str(src_dirs_str)
    line_counts(results_dir, sample_rate, src_dirs_str.split(','), git_dir, working_dir)
if __name__ == "__main__":
    main()