diff --git a/README.md b/README.md
index 0eac12e..8cdf0af 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 ### [gerrit-rechecks](gerrit-rechecks/README.md): Scripts to get the last comment in Gerrit by Data, Submit Changes to Gerrit Projects with "Depends-On" and Reverify Gerrit Changes.
 ### [EoD-stuff](EoD-stuff/README.md): Scripts for making life easier to the Engineer on Duty :)
 ### [provision](provision/README.md): Different implementation of ansible roles and playbooks to automate things.
+### [similarity-comparison](similarity-comparison/README.md): Scripts for comparing InfraRed based Jenkins jobs.
-
 
 ![](https://github.com/RedHatCRE/toolbox/workflows/tests/badge.svg)
 
diff --git a/similarity-comparison/.gitignore b/similarity-comparison/.gitignore
new file mode 100644
index 0000000..513aea0
--- /dev/null
+++ b/similarity-comparison/.gitignore
@@ -0,0 +1,3 @@
+jjs.db
+jjs.xlsx
+venv/**
diff --git a/similarity-comparison/README.md b/similarity-comparison/README.md
new file mode 100644
index 0000000..fc854cf
--- /dev/null
+++ b/similarity-comparison/README.md
@@ -0,0 +1,15 @@
+HOWTO
+-----
+virtualenv venv
+. ./venv/bin/activate
+pip install -r requirements.txt
+python similarity_comparison.py
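+
+The script reads Jenkins credentials from
+~/.config/jenkins_jobs/jenkins_jobs.ini ([jenkins] section with url, user
+and password keys). A minimal example, with placeholder values:
+
+    [jenkins]
+    url=https://jenkins.example.com/
+    user=jenkins-user
+    password=secret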
diff --git a/similarity-comparison/infrared_args_patch b/similarity-comparison/infrared_args_patch
new file mode 100644
index 0000000..26c532c
--- /dev/null
+++ b/similarity-comparison/infrared_args_patch
@@ -0,0 +1,36 @@
+diff --git a/infrared/api.py b/infrared/api.py
+index e88b2949..6fc7f77a 100644
+--- a/infrared/api.py
++++ b/infrared/api.py
+@@ -116,6 +116,12 @@ class InfraredPluginsSpec(SpecObject):
+         # unpack parsed arguments
+         nested_args, control_args, custom_args = parsed_args
+ 
++        # print and serialize the parsed nested arguments, then terminate
++        import pickle
++        print(nested_args)
++        with open('nested_args.pkl', 'wb') as f:
++            pickle.dump(nested_args, f)
++        exit(0)
+         if control_args.get('debug', None):
+             logger.LOG.setLevel(logging.DEBUG)
+ 
+@@ -198,12 +204,18 @@ class SpecManager(object):
+ 
+     def run_specs(self, args=None):
+         spec_args = vars(self.parser.parse_args(args))
++        print(spec_args)
+         subcommand = spec_args.get('subcommand', '')
+         if not spec_args.get('no_log_commands'):
+             if self.execution_logger is None:
+                 self.execution_logger = CoreServices.execution_logger_manager()
+             self.execution_logger.command()
+ 
++        # serialize the subcommand name
++        import pickle
++        with open('subcommand.pkl', 'wb') as f:
++            pickle.dump(subcommand, f)
++
+         if subcommand in self.spec_objects:
+             return self.spec_objects[subcommand].spec_handler(
+                 self.parser, args=args)
diff --git a/similarity-comparison/requirements.txt b/similarity-comparison/requirements.txt
new file mode 100644
index 0000000..9162eaf
--- /dev/null
+++ b/similarity-comparison/requirements.txt
@@ -0,0 +1,4 @@
+gitpython
+requests
+scikit-learn
+xlsxwriter
diff --git a/similarity-comparison/similarity_comparison.py b/similarity-comparison/similarity_comparison.py
new file mode 100644
index 0000000..a786400
--- /dev/null
+++ b/similarity-comparison/similarity_comparison.py
@@ -0,0 +1,362 @@
+import configparser
+import json
+import logging
+import os.path
+import re
+import sqlite3
+import subprocess
+import sys
+from io import StringIO
+from os.path import expanduser
+
+import requests
+import xlsxwriter
+from git import Repo
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+
+httpRequest = {
+    'requestJobsAndBuildInfo':
+        "/api/json/?tree=jobs[name,lastBuild[result,number,timestamp]]",
+    'requestJobs':
+        "/api/json?tree=jobs[name]",
+    'requestStableBuildArtifact':
+        "/job/{jobName}/lastStableBuild/artifact/{artifactPath}",
+    'requestArtifact':
+        "/job/{jobName}/lastSuccessfulBuild/artifact/{artifactPath}"
+}
+
+
+def get_base_prefix_compat():
+    """Get base/real prefix, or sys.prefix if there is none."""
+    return getattr(sys, "base_prefix", None) or getattr(sys, "real_prefix",
+                                                        None) or sys.prefix
+
+
+def in_virtualenv():
+    return get_base_prefix_compat() != sys.prefix
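+
+
+# The infrared_args_patch above makes a patched "infrared <plugin> ..." run
+# dump its parsed arguments to nested_args.pkl and subcommand.pkl. A minimal
+# sketch of how those pickles can be read back (helper added for
+# illustration; nothing calls it yet):
+def load_serialized_args(pkl_dir="."):
+    import pickle
+    with open(os.path.join(pkl_dir, 'nested_args.pkl'), 'rb') as f:
+        nested_args = pickle.load(f)
+    with open(os.path.join(pkl_dir, 'subcommand.pkl'), 'rb') as f:
+        subcommand = pickle.load(f)
+    return subcommand, nested_args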
subprocess.call("rm -rf " + repo_dir, shell=True) + Repo.clone_from(git_url, repo_dir) + + # apply the arg serialization patch + command = "cp infrared_agrs_patch " + repo_dir + ";" + \ + "cd " + repo_dir + ";" + \ + "git apply infrared_agrs_patch" + subprocess.call(command, shell=True) + + #install infarred in a virtual environment + if (not in_virtualenv()): + raise Exception("This code installs pip packages and is " + \ + "adviced to be executed in a virtual environment") + + command = "cd " + repo_dir + ";" + \ + "pip install - U pip;" + \ + "pip install ." + subprocess.call(command, shell=True) + + # add additional plugins for enhanced parsing + subprocess.call("infrared plugin add all", shell=True) + + def _extract_ir_commands(self, file_context: str): + i = 0 + extracts = [] + REGEXP_START = r"\s*(infrared|ir)" + pattern_start = re.compile(REGEXP_START) + + # reformat file content to un-split multiline bash commands + file = StringIO(file_context.replace("\\\n", " ")) + + for line in file: + i += 1 # line counting starts with 1 + if pattern_start.match(line): + extracts.append((i, line.rstrip('\n'))) + + for line in extracts: + + status, output = subprocess.getstatusoutput(line[1]) + output = line[1].strip() + "\n" + output + print (output) + if status != 0: + logging.warning(output) + def _print_parsed_paramters(self): + # fetch unified jobs + sql_command = \ + 'SELECT DISTINCT * FROM jjs WHERE jobName LIKE ' + \ + '\'%DFG%\' AND jobName LIKE \'%unified%\' ORDER BY jobName' + unifiedJobs = self._fetch_jobs_from_DB(sql_command) + print("Total of unified jobs are: ", len(unifiedJobs)) + + for rowUnified in unifiedJobs: + jobNameUnified = str(rowUnified[0]) + print(len(unifiedJobs)) + + releaseUnified = self._extractVersionFromJobName( + jobNameUnified) + ipVersionUnifed = self._extractIPVersionFromJobName( + jobNameUnified) + + file_content = str(rowUnified[1]) + + print(self._extract_ir_commands(file_content)) + + + def _insertDataIntoTable(self, jobName, artifatcContent): + try: + cursor = self.dbcon.cursor() + sqlite_insert_with_param = """INSERT INTO jjs + (jobName, artifatcContent) + VALUES (?, ?);""" + data_tuple = (jobName, artifatcContent) + cursor.execute(sqlite_insert_with_param, data_tuple) + self.dbcon.commit() + cursor.close() + return 0 + + except sqlite3.Error as error: + print("Failed to insert into sqlite table", error) + return -1 + + def populateDB(self): + # get all Jobs + request = requests.get(self.url + httpRequest['requestJobs'], + verify=False, + auth=self.credentials) + jobsInJSON = json.loads(request.text) + print(json.dumps(jobsInJSON, indent=4, sort_keys=True)) + + skipList = ["util"] + + # get and store an artifact (if found) + okCounter = 0 + insertCounter = 0 + for element in jobsInJSON['jobs']: + print(element['name']) + jobName = element['name'] + if jobName in skipList: + continue + requestStr = self.url + httpRequest['requestArtifact'].format( + jobName=jobName, + artifactPath=self.artifactPath) + request = requests.get(requestStr, verify=False, + auth=self.credentials) + print(requestStr) + if request.ok: + okCounter = okCounter + 1 + if self._insertDataIntoTable(jobName, request.text) >= 0: + insertCounter = insertCounter + 1 + + print("From populateDB") + print("okCounter: " + str(okCounter)) + print("insertCounter: " + str(insertCounter)) + print("number of jobs: " + str(len(jobsInJSON['jobs']))) + assert (okCounter == insertCounter) + + def _normilizeArtifact(self, artifact): + regex = r".*infrared (tripleo-undercloud|tripleo-overcloud) .*\\*" 
+    def _normalizeArtifact(self, artifact):
+        plugin_names = "(tripleo-undercloud|tripleo-overcloud)"
+        regex = r".*infrared " + plugin_names + r" .*(([\r\n]*).*){4}"
+        matches = re.finditer(regex, artifact, re.MULTILINE)
+        normalizedArtifact = ""
+        for matchNum, match in enumerate(matches, start=1):
+            print(
+                "Match {matchNum} was found at {start}-{end}: {match}".format(
+                    matchNum=matchNum,
+                    start=match.start(),
+                    end=match.end(),
+                    match=match.group()))
+            normalizedArtifact = normalizedArtifact + "\n" + match.group()
+
+        return normalizedArtifact
+
+    def _extractVersionFromJobName(self, jobName):
+        # matches XY.Z XY XY_Z in job names
+        REGEXP = r'\s*([\d(.|_)]+)(_compact|-compact|_director|-director)\s*'
+
+        version = re.search(REGEXP, jobName).group(1)
+        version = version.replace("_", ".")  # for jobs with XY_Z
+
+        return version
+
+    def _extractIPVersionFromJobName(self, jobName):
+        # matches the IP version (e.g. ipv4, ipv6) in job names
+        REGEXP = r".*ipv([\d]+).*"
+
+        try:
+            version = re.search(REGEXP, jobName).group(1)
+        except AttributeError:
+            version = "NA"
+
+        return version
+
+    # return True if the artifact contains any of the filter-out criteria
+    def _isFilteredOut(self, artifact):
+        filters = ["infrared tripleo-inventory",
+                   "infrared workspace import",
+                   "sshpass -p stack ssh -o UserKnownHostsFile=/dev/null",
+                   "infrared tripleo-upgrade"]
+
+        artifactString = str(artifact)
+
+        intersection = [value for value in filters if value in artifactString]
+
+        return len(intersection) > 0
+
+    def _fetch_jobs_from_DB(self, sql_command):
+        cursor = self.dbcon.cursor()
+        cursor.execute(sql_command)
+        jobs = cursor.fetchall()
+        cursor.close()
+        return jobs
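+
+    # TfidfVectorizer L2-normalizes its rows by default, so the product
+    # tfidf * tfidf.T in analyseJJSTable below is the pairwise cosine
+    # similarity of the vectorized artifacts. The same idea in isolation:
+    #
+    #     tfidf = TfidfVectorizer().fit_transform(["ir deploy a",
+    #                                              "ir deploy b"])
+    #     similarity = (tfidf * tfidf.T).toarray()[0, 1]  # value in [0, 1]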
+
+    def analyseJJSTable(self):
+        # fetch unified jobs
+        sql_command = \
+            'SELECT DISTINCT * FROM jjs WHERE jobName LIKE ' + \
+            '\'%unified%\' AND jobName LIKE \'%director%\' ORDER BY jobName'
+        unifiedJobs = self._fetch_jobs_from_DB(sql_command)
+        print("Total number of unified jobs: ", len(unifiedJobs))
+
+        # fetch other director jobs (including unified ones) to compare
+        # against the unified jobs
+        sql_command = \
+            'SELECT DISTINCT * FROM jjs WHERE jobName LIKE ' + \
+            '\'%director%\' AND jobName NOT LIKE \'%compact%\''
+        directorJobs = self._fetch_jobs_from_DB(sql_command)
+        print("Total number of director jobs: ", len(directorJobs))
+
+        unifiedJobsCounter = 0
+        cell_format = self.workbook.add_format(
+            {'bold': True, 'font_color': 'red'})
+        for rowUnified in unifiedJobs:
+            jobNameUnified = str(rowUnified[0])
+            print(len(unifiedJobs))
+            try:
+                unifiedJobsCounter += 1
+                worksheet = self.workbook.add_worksheet(
+                    jobNameUnified[1:28] + "--" + str(unifiedJobsCounter))
+                worksheet.set_column(0, 0, len(jobNameUnified))
+                worksheet.write(0, 0, jobNameUnified, cell_format)
+                row = 1
+            except xlsxwriter.exceptions.DuplicateWorksheetName:
+                continue
+            for rowDirector in directorJobs:
+                jobNameDirector = str(rowDirector[0])
+                releaseUnified = self._extractVersionFromJobName(
+                    jobNameUnified)
+                releaseDirector = self._extractVersionFromJobName(
+                    jobNameDirector)
+                ipVersionUnified = self._extractIPVersionFromJobName(
+                    jobNameUnified)
+                ipVersionDirector = self._extractIPVersionFromJobName(
+                    jobNameDirector)
+                # if releaseUnified not in ["16.1", "16.2"]:
+                #     continue
+
+                if jobNameUnified != jobNameDirector and \
+                        releaseUnified == releaseDirector and \
+                        ipVersionUnified == ipVersionDirector:
+                    artifactUnified = str(rowUnified[1])
+                    artifactDirector = str(rowDirector[1])
+                    if self._isFilteredOut(artifactDirector):
+                        continue
+                    normalizedUnified = self._normalizeArtifact(
+                        artifactUnified)
+                    normalizedDirector = self._normalizeArtifact(
+                        artifactDirector)
+                    try:
+                        tfidf = TfidfVectorizer().fit_transform(
+                            [normalizedUnified, normalizedDirector])
+                        # no need to normalize, since the vectorizer returns
+                        # normalized tf-idf rows
+                        pairwise_similarity = tfidf * tfidf.T
+                    except Exception:
+                        print("Cannot compare " + rowUnified[0] + " and " +
+                              rowDirector[0] + "\n")
+                        continue
+                    threshold = pairwise_similarity.data.min()
+
+                    if threshold >= 0.0:
+                        wordsUnified = set(normalizedUnified.split())
+                        wordsDirector = set(normalizedDirector.split())
+                        unifiedUniques = set(
+                            sorted(wordsUnified.difference(wordsDirector)))
+                        directorUniques = set(
+                            sorted(wordsDirector.difference(wordsUnified)))
+                        uniques = unifiedUniques.union(directorUniques)
+                        print(jobNameUnified + "," + str(unifiedUniques))
+                        print(jobNameDirector + "," + str(directorUniques))
+                        fstr = 'Total uniques: {}, Pairwise Similarity: {}\n'
+                        print(fstr.format(len(uniques), threshold))
+                        try:
+                            worksheet.set_column(row, 0, len(jobNameDirector))
+                            worksheet.write(row, 0, jobNameDirector)
+
+                            threshold = round(threshold, 3)
+                            worksheet.set_column(row, 1, len(str(threshold)))
+                            worksheet.write(row, 1, str(threshold))
+
+                            row += 1
+                        except Exception as e:
+                            print(e)
+                            continue
+
+
+credentialsPath = expanduser("~") + '/.config/jenkins_jobs/jenkins_jobs.ini'
+artifactPath = '.sh/run.sh'
+jjsc = JJSC(credentialsPath, artifactPath)
+
+# jjsc.populateDB()
+# jjsc.analyseJJSTable()
+
+jjsc._prepare_arg_parsing_and_serialization()
+jjsc._print_parsed_parameters()
+
+del jjsc
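+
+# NOTE: populateDB()/analyseJJSTable() are the original Jenkins-to-DB/XLSX
+# workflow; the two active calls above exercise the newer argument
+# serialization path. Uncomment the former (and comment the latter) to
+# regenerate jjs.db and jjs.xlsx instead.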