"""Load algorithm run records from a log file and group them by pipeline."""

import json
import os
import subprocess
import time


def _read_log_records(path):
    """Return the list of JSON records stored in the log file at *path*.

    The log is a sequence of JSON values, each followed by a comma (and
    usually a newline), i.e. the inside of a JSON array without the
    enclosing brackets.  Trailing whitespace and trailing commas are
    stripped before the text is wrapped in ``[...]`` and parsed, so the file
    may end with ``",\\n"``, ``","``, ``",\\r\\n"`` or no comma at all.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        text = handle.read()
    body = text.rstrip().rstrip(",")
    return json.loads("[" + body + "]")


class Algorithm():
    """An executable algorithm referenced by one or more logged runs.

    Attributes:
        path: Path of the executable as recorded in the log.
        data_cls: The owning ``Data`` instance.
        name: Short name (last path component) used as the registry key.
        current_version: Integer printed by ``<path> -v``; ``None`` when
            versions are not validated or the executable is not found.
        data: List that starts out empty; nothing in this module fills it.
    """

    def __init__(self, data_cls, name, path):
        self.path = path
        self.data_cls = data_cls
        self.name = name
        # Always define the attribute so that reading it never raises
        # AttributeError when version validation is switched off.
        self.current_version = None
        if data_cls.validate_versions:
            try:
                result = subprocess.run(
                    [self.path, "-v"], stdout=subprocess.PIPE, check=False
                )
            except FileNotFoundError:
                # Executable is gone: leave the version unknown.
                pass
            else:
                self.current_version = int(result.stdout)
        self.data = []


class Run:
    """One logged execution of an algorithm, optionally chained to a parent.

    A record containing a ``"from"`` key describes a run that consumed the
    output of another run; the parent is parsed recursively and the seed is
    inherited from it.
    """

    def __init__(self, data_cls, json):
        """Build a run from one decoded log record.

        Args:
            data_cls: The owning ``Data`` instance; its ``algorithms``
                registry is extended when the algorithm is new.
            json: The decoded record (a dict).  The parameter name is kept
                for backward compatibility even though it shadows the
                ``json`` module inside this method.

        Raises:
            KeyError: If a required key (``algo``, ``version``, ``n``,
                ``score`` for runs without an error, ``seed`` for runs
                without a parent) is missing.
        """
        record = json
        self.data_cls = data_cls

        algo_path = record["algo"]
        algo_name = algo_path.split("/")[-1]
        if algo_name not in data_cls.algorithms:
            data_cls.algorithms[algo_name] = Algorithm(
                data_cls, algo_name, algo_path
            )
        self.algo = data_cls.algorithms[algo_name]

        self.algo_version = record["version"]
        self.args = record.get("args", [])
        self.n = record["n"]
        self.error = record.get("error", None)
        # A failed run has no score of its own; ``n`` is used in its place.
        self.score = self.n if self.error else record["score"]
        self.data = record.get("data", {})

        resources = record.get("resources", {})
        self.resources_memory_kb = resources.get("memory_kb", None)
        self.resources_cpu_time_s = resources.get("cpu_time_s", None)
        self.resources_wallclock_s = resources.get("wallclock_s", None)

        self.mistakes = record.get("mistakes", None)

        if "from" in record:
            self._from = Run(data_cls, record["from"])
            self.seed = self._from.seed
        else:
            self._from = None
            self.seed = record["seed"]

    def print_pipeline(self, versions=False):
        """Return the pipeline as a string, e.g. ``"a(1, 2)|b"``.

        Stages are joined with ``"|"``, parent first.  Each stage is the
        algorithm name, followed by ``":<version>"`` when *versions* is
        true, followed by the comma-separated arguments in parentheses when
        there are any.
        """
        stage = self.algo.name
        if versions:
            stage += f":{self.algo_version}"
        if self.args:
            stage += "(" + ", ".join(f"{arg}" for arg in self.args) + ")"
        if self._from is not None:
            return self._from.print_pipeline(versions=versions) + "|" + stage
        return stage

    def is_up_to_date(self):
        """Return whether this run and all its parents used current versions.

        Always true when the owning ``Data`` does not validate versions.
        """
        if not self.data_cls.validate_versions:
            return True
        if self._from is not None and not self._from.is_up_to_date():
            return False
        return self.algo_version == self.algo.current_version


class Data:
    """All runs read from a log file, indexed by algorithm and pipeline.

    Attributes:
        validate_versions: Whether runs made with outdated algorithm
            versions are excluded from ``pipelines``.
        algorithms: Mapping of algorithm name to ``Algorithm``.
        pipelines: Mapping of pipeline string (without versions) to the list
            of up-to-date runs of that pipeline.
        runs: Every run in the log, in file order, including outdated ones.
    """

    def __init__(self, logfile=None, validate_versions=True):
        """Read and index the log.

        Args:
            logfile: Path of the log file.  Falls back to the ``LOGFILE``
                environment variable and then to ``"log"``.
            validate_versions: See the class attributes.
        """
        self.validate_versions = validate_versions
        logfile = logfile or os.environ.get('LOGFILE', 'log')
        raw_data = _read_log_records(logfile)

        self.algorithms = {}
        self.pipelines = {}
        self.runs = []
        for it in raw_data:
            run = Run(self, it)
            self.runs.append(run)
            if not validate_versions or run.is_up_to_date():
                self.pipelines.setdefault(run.print_pipeline(), []).append(run)


def group_by_n(arr):
    """Group the items of *arr* by their ``n`` attribute.

    Returns a dict mapping each distinct ``n`` to the list of items having
    it, preserving the input order within each list.
    """
    by_n = {}
    for item in arr:
        by_n.setdefault(item.n, []).append(item)
    return by_n