From 46b74a49eed1966ae85a9f9a91908543a14031fc Mon Sep 17 00:00:00 2001 From: gmierz Date: Tue, 21 Jul 2020 11:34:48 -0400 Subject: [PATCH] [fenix] Update visual-metric code. --- .../visual-metrics/run-visual-metrics.py | 69 +++-- .../docker/visual-metrics/similarity.py | 291 ++++++++++++------ 2 files changed, 237 insertions(+), 123 deletions(-) diff --git a/taskcluster/docker/visual-metrics/run-visual-metrics.py b/taskcluster/docker/visual-metrics/run-visual-metrics.py index 4ae05172d3..14b15221f8 100644 --- a/taskcluster/docker/visual-metrics/run-visual-metrics.py +++ b/taskcluster/docker/visual-metrics/run-visual-metrics.py @@ -27,12 +27,16 @@ from voluptuous import ALLOW_EXTRA, Required, Schema #: The directory where artifacts from this job will be placed. OUTPUT_DIR = Path("/", "builds", "worker", "artifacts") + #: A job to process through visualmetrics.py @attr.s class Job: #: The name of the test. test_name = attr.ib(type=str) + #: The extra options for this job. + extra_options = attr.ib(type=str) + #: json_path: The path to the ``browsertime.json`` file on disk. json_path = attr.ib(type=Path) @@ -44,7 +48,11 @@ class Job: JOB_SCHEMA = Schema( { Required("jobs"): [ - {Required("test_name"): str, Required("browsertime_json_path"): str} + { + Required("test_name"): str, + Required("browsertime_json_path"): str, + Required("extra_options"): [str], + } ], Required("application"): {Required("name"): str, "version": str}, Required("extra_options"): [str], @@ -80,7 +88,7 @@ def run_command(log, cmd): return e.returncode, e.output -def append_result(log, suites, test_name, name, result): +def append_result(log, suites, test_name, name, result, extra_options): """Appends a ``name`` metrics result in the ``test_name`` suite. Args: @@ -98,10 +106,16 @@ def append_result(log, suites, test_name, name, result): log.error("Could not convert value", name=name) log.error("%s" % result) result = 0 - if test_name not in suites: - suites[test_name] = {"name": test_name, "subtests": {}} - subtests = suites[test_name]["subtests"] + if test_name in suites and suites[test_name]["extraOptions"] != extra_options: + missing = set(extra_options) - set(suites[test_name]["extraOptions"]) + test_name = test_name + "-".join(list(missing)) + + subtests = suites.setdefault( + test_name, + {"name": test_name, "subtests": {}, "extraOptions": extra_options} + )["subtests"] + if name not in subtests: subtests[name] = { "name": name, @@ -241,6 +255,8 @@ def main(log, args): jobs.append( Job( test_name=job["test_name"], + extra_options=len(job["extra_options"]) > 0 and + job["extra_options"] or jobs_json["extra_options"], json_path=browsertime_json_path, video_path=browsertime_json_path.parent / video, ) @@ -273,45 +289,34 @@ def main(log, args): # Python 3.5 requires a str object (not 3.6+) res = json.loads(res.decode("utf8")) for name, value in res.items(): - append_result(log, suites, job.test_name, name, value) + append_result(log, suites, job.test_name, name, value, job.extra_options) suites = [get_suite(suite) for suite in suites.values()] perf_data = { "framework": {"name": "browsertime"}, "application": jobs_json["application"], - "type": "vismet", + "type": "pageload", "suites": suites, } - for entry in suites: - entry["extraOptions"] = jobs_json["extra_options"] # Try to get the similarity for all possible tests, this means that we # will also get a comparison of recorded vs. live sites to check # the on-going quality of our recordings. - similarity = None - if "android" in os.getenv("TC_PLATFORM", ""): - try: - from similarity import calculate_similarity - similarity = calculate_similarity(jobs_json, fetch_dir, OUTPUT_DIR, log) - except Exception: - log.info("Failed to calculate similarity score", exc_info=True) - - if similarity: - suites[0]["subtests"].append({ - "name": "Similarity3D", - "value": similarity[0], - "replicates": [similarity[0]], - "lowerIsBetter": False, - "unit": "a.u.", - }) - suites[0]["subtests"].append({ - "name": "Similarity2D", - "value": similarity[1], - "replicates": [similarity[1]], - "lowerIsBetter": False, - "unit": "a.u.", - }) + try: + from similarity import calculate_similarity + for name, value in calculate_similarity(jobs_json, fetch_dir, OUTPUT_DIR).items(): + if value is None: + continue + suites[0]["subtests"].append({ + "name": name, + "value": value, + "replicates": [value], + "lowerIsBetter": False, + "unit": "a.u.", + }) + except Exception: + log.info("Failed to calculate similarity score", exc_info=True) # Validates the perf data complies with perfherder schema. # The perfherder schema uses jsonschema so we can't use voluptuous here. diff --git a/taskcluster/docker/visual-metrics/similarity.py b/taskcluster/docker/visual-metrics/similarity.py index 5820e531e0..f56e158758 100644 --- a/taskcluster/docker/visual-metrics/similarity.py +++ b/taskcluster/docker/visual-metrics/similarity.py @@ -10,6 +10,7 @@ import os import pathlib import shutil import socket +import structlog import tarfile import tempfile import urllib @@ -19,8 +20,24 @@ from matplotlib import pyplot as plt from scipy.stats import spearmanr -def open_data(file): - return cv2.VideoCapture(str(file)) +log = None + + +# We add the `and` conditions to it later +base_ad_query = { + "from": "task", + "limit": 1000, + "where": { + "and": [] + }, + "select": [ + "action.start_time", + "run.name", + "task.artifacts", + "task.group.id", + "task.id" + ], +} def socket_timeout(value=120): @@ -38,8 +55,12 @@ def socket_timeout(value=120): return _socket_timeout +def _open_data(file): + return cv2.VideoCapture(str(file)) + + @socket_timeout(120) -def query_activedata(query_json, log): +def _query_activedata(query_json): """Used to run queries on active data.""" active_data_url = "http://activedata.allizom.org/query" @@ -59,7 +80,7 @@ def query_activedata(query_json, log): @socket_timeout(120) -def download(url, loc, log): +def _download(url, loc): """Downloads from a url (with a timeout).""" log.info("Downloading %s" % url) try: @@ -70,7 +91,7 @@ def download(url, loc, log): return True -def get_frames(video): +def _get_frames(video): """Gets all frames from a video into a list.""" allframes = [] while video.isOpened(): @@ -84,77 +105,11 @@ def get_frames(video): return allframes -def calculate_similarity(jobs_json, fetch_dir, output, log): - """Calculates the similarity score against the last live site test. - - The technique works as follows: - 1. Get the last live site test. - 2. For each 15x15 video pairings, build a cross-correlation matrix: - 1. Get each of the videos and calculate their histograms - across the full videos. - 2. Calculate the correlation coefficient between these two. - 3. Average the cross-correlation matrix to obtain the score. - - The 2D similarity score is the same, except that it builds a histogram - from the final frame instead of the full video. - - For finding the last live site, we use active-data. We search for - PGO android builds since this metric is only available for live sites that - run on android in mozilla-cental. Given that live sites currently - run on cron 3 days a week, then it's also reasonable to look for tasks - which have occurred before today and within the last two weeks at most. - But this is a TODO for future work, since we need to determine a better - way of selecting the last task (HG push logs?) - there's a lot that factors - into these choices, so it might require a multi-faceted approach. - - Args: - jobs_json: The jobs JSON that holds extra information. - fetch_dir: The fetch directory that holds the new videos. - log: The logger. - Returns: - Two similarity scores (3D, 2D) as a float, or None if there was an issue. - """ - app = jobs_json["application"]["name"] - test = jobs_json["jobs"][0]["test_name"] - splittest = test.split("-cold") - - cold = "" - if len(splittest) > 0: - cold = ".*cold" - test = splittest[0] - - # PGO vs. OPT shouldn't matter much, but we restrict it to PGO builds here - # for android, and desktop tests have the opt/pgo restriction removed - plat = os.getenv("TC_PLATFORM", "") - if "android" in plat: - plat = plat.replace("/opt", "/pgo") - else: - plat = plat.replace("/opt", "").replace("/pgo", "") - ad_query = { - "from": "task", - "limit": 1000, - "where": { - "and": [ - { - "regexp": { - "run.name": ".*%s.*browsertime.*-live.*%s%s.*%s.*" - % (plat, app, cold, test) - } - }, - {"not": {"prefix": {"run.name": "test-vismet"}}}, - {"in": {"repo.branch.name": ["mozilla-central"]}}, - {"gte": {"action.start_time": {"date": "today-week-week"}}}, - {"lt": {"action.start_time": {"date": "today-1day"}}}, - {"in": {"task.run.state": ["completed"]}}, - ] - }, - "select": ["action.start_time", "run.name", "task.artifacts"], - } - - # Run the AD query and find the browsertime videos to download +def _get_browsertime_results(query): + """Used to run an AD query and extract the browsertime results if they exist.""" failed = False try: - data = query_activedata(ad_query, log) + data = _query_activedata(query) except Exception as e: log.info(str(e)) failed = True @@ -162,6 +117,7 @@ def calculate_similarity(jobs_json, fetch_dir, output, log): log.info("Couldn't get activedata data") return None + # Find the newest browsertime task log.info("Found %s datums" % str(len(data["action.start_time"]))) maxind = np.argmax([float(t) for t in data["action.start_time"]]) artifacts = data["task.artifacts"][maxind] @@ -171,13 +127,20 @@ def calculate_similarity(jobs_json, fetch_dir, output, log): btime_artifact = art["url"] break if not btime_artifact: - log.info("Can't find an older live site") + log.info("Can't find an older site test") return None + log.info("Comparing videos to TASK_GROUP=%s, TASK_ID=%s" % ( + data["task.group.id"][maxind], data["task.id"][maxind] + )) + # Download the browsertime videos and untar them tmpdir = tempfile.mkdtemp() loc = os.path.join(tmpdir, "tmpfile.tgz") - if not download(btime_artifact, loc, log): + if not _download(btime_artifact, loc): + log.info( + "Failed to download browsertime-results artifact from %s" % btime_artifact + ) return None tmploc = tempfile.mkdtemp() try: @@ -191,22 +154,90 @@ def calculate_similarity(jobs_json, fetch_dir, output, log): ) return None - # Find all the videos - oldmp4s = [str(f) for f in pathlib.Path(tmploc).rglob("*.mp4")] - log.info("Found %s old videos" % str(len(oldmp4s))) - newmp4s = [str(f) for f in pathlib.Path(fetch_dir).rglob("*.mp4")] - log.info("Found %s new videos" % str(len(newmp4s))) + return tmploc - # Finally, calculate the 2D/3D score + +def _data_from_last_task(label): + """Gets the data from the last PGO/OPT task with the same label. + + We look for both OPT and PGO tasks. The difference + between them should be minimal. This method also provides + a way to compare recordings from this task to another + known task based on the TC_GROUP_ID environment varible. + """ + label_opt = label.replace("/pgo", "/opt") + label_pgo = label.replace("/opt", "/pgo") + + base_ad_query["where"]["and"] = [ + {"in": {"task.run.state": ["completed"]}}, + {"or": [ + {"eq": {"run.name": label_pgo}}, + {"eq": {"run.name": label_opt}} + ]} + ] + + task_group_id = os.getenv("TC_GROUP_ID", "") + if task_group_id: + base_ad_query["where"]["and"].append( + {"eq": {"task.group.id": task_group_id}} + ) + else: + base_ad_query["where"]["and"].extend([ + {"in": {"repo.branch.name": ["mozilla-central"]}}, + {"gte": {"action.start_time": {"date": "today-week-week"}}}, + ]) + + return _get_browsertime_results(base_ad_query) + + +def _data_from_last_live_task(label): + """Gets the data from the last live site PGO task.""" + label_live = label.replace("/opt", "/pgo").replace("tp6m", "tp6m-live") + + base_ad_query["where"]["and"] = [ + {"in": {"repo.branch.name": ["mozilla-central"]}}, + {"gte": {"action.start_time": {"date": "today-week-week"}}}, + {"in": {"task.run.state": ["completed"]}}, + {"eq": {"run.name": label_live}}, + ] + + return _get_browsertime_results(base_ad_query) + + +def _get_similarity(old_videos_info, new_videos_info, output, prefix=""): + """Calculates a similarity score for two groupings of videos. + + The technique works as follows: + 1. Get the last live site test. + 2. For each 15x15 video pairings, build a cross-correlation matrix: + 1. Get each of the videos and calculate their histograms + across the full videos. + 2. Calculate the correlation coefficient between these two. + 3. Average the cross-correlation matrix to obtain the score. + + The 2D similarity score is the same, except that it builds a histogram + from the final frame instead of the full video. + + Args: + old_videos: List of old videos. + new_videos: List of new videos (from this task). + output: Location to output videos with low similarity scores. + prefix: Prefix a string to the output. + Returns: + Two similarity scores (3D, 2D) as a float. + """ nhists = [] nhists2d = [] - total_vids = min(len(oldmp4s), len(newmp4s)) + old_videos = [entry["data"] for entry in old_videos_info] + new_videos = [entry["data"] for entry in new_videos_info] + + total_vids = min(len(old_videos), len(new_videos)) xcorr = np.zeros((total_vids, total_vids)) xcorr2d = np.zeros((total_vids, total_vids)) for i in range(total_vids): - datao = np.asarray(get_frames(open_data(oldmp4s[i]))) + datao = np.asarray(_get_frames(old_videos[i])) histo, _, _ = plt.hist(datao.flatten(), bins=255) histo2d, _, _ = plt.hist(datao[-1, :, :].flatten(), bins=255) @@ -214,7 +245,7 @@ def calculate_similarity(jobs_json, fetch_dir, output, log): for j in range(total_vids): if i == 0: # Only calculate the histograms once; it takes time - datan = np.asarray(get_frames(open_data(newmp4s[j]))) + datan = np.asarray(_get_frames(new_videos[j])) histn, _, _ = plt.hist(datan.flatten(), bins=255) histn2d, _, _ = plt.hist(datan[-1, :, :].flatten(), bins=255) @@ -237,15 +268,93 @@ def calculate_similarity(jobs_json, fetch_dir, output, log): log.info("Average 3D similarity: %s" % str(np.round(similarity, 5))) log.info("Average 2D similarity: %s" % str(np.round(similarity2d, 5))) - if similarity < 0.5: - # For really low correlations, output the worst video pairing + if np.round(similarity, 1) <= 0.7 or np.round(similarity2d, 1) <= 0.7: + # For low correlations, output the worst video pairing # so that we can visually see what the issue was minind = np.unravel_index(np.argmin(xcorr, axis=None), xcorr.shape) - oldvid = oldmp4s[minind[0]] - shutil.copyfile(oldvid, str(pathlib.Path(output, "old_video.mp4"))) + oldvid = old_videos_info[minind[0]]["path"] + shutil.copyfile(oldvid, str(pathlib.Path(output, "%sold_video.mp4" % prefix))) - newvid = newmp4s[minind[1]] - shutil.copyfile(newvid, str(pathlib.Path(output, "new_video.mp4"))) + newvid = new_videos_info[minind[1]]["path"] + shutil.copyfile(newvid, str(pathlib.Path(output, "%snew_video.mp4" % prefix))) return np.round(similarity, 5), np.round(similarity2d, 5) + + +def calculate_similarity(jobs_json, fetch_dir, output): + """Calculates the similarity score for this task. + + Here we use activedata to find the last live site that ran and + to find the last task (with the same label) that ran. Those two + tasks are then compared to the current one and 4 metrics are produced. + + For live sites, we only calculate 2 of these metrics, since the + playback similarity is not applicable to it. + + Args: + jobs_json: The jobs JSON that holds extra information. + fetch_dir: The fetch directory that holds the new videos. + output: The output directory. + Returns: + A dictionary containing up to 4 different metrics (their values default + to None if a metric couldn't be calculated): + PlaybackSimilarity: Similarity of the full playback to a live site test. + PlaybackSimilarity2D: - // - (but for the final frame only) + Similarity: Similarity of the tests video recording to its last run. + Similarity2D: - // - (but for the final frame only) + """ + global log + log = structlog.get_logger() + + label = os.getenv("TC_LABEL", "") + if not label: + log.info("TC_LABEL is undefined, cannot calculate similarity metrics") + return {} + + # Get all the newest videos from this task + new_btime_videos = [ + {"data": _open_data(str(f)), "path": str(f)} + for f in pathlib.Path(fetch_dir).rglob("*.mp4") + ] + log.info("Found %s new videos" % str(len(new_btime_videos))) + + # Get the similarity against the last task + old_btime_res = _data_from_last_task(label) + old_sim = old_sim2d = None + if old_btime_res: + old_btime_videos = [ + {"data": _open_data(str(f)), "path": str(f)} + for f in pathlib.Path(old_btime_res).rglob("*.mp4") + ] + log.info("Found %s old videos" % str(len(old_btime_videos))) + + old_sim, old_sim2d = _get_similarity( + old_btime_videos, new_btime_videos, output + ) + else: + log.info("Failed to find an older test task") + + # Compare recordings to their live site variant if it exists + live_sim = live_sim2d = None + if "live" not in jobs_json["extra_options"]: + live_btime_res = _data_from_last_live_task(label) + if live_btime_res: + live_btime_videos = [ + {"data": _open_data(str(f)), "path": str(f)} + for f in pathlib.Path(live_btime_res).rglob("*.mp4") + ] + log.info("Found %s live videos" % str(len(live_btime_videos))) + + live_sim, live_sim2d = _get_similarity( + live_btime_videos, new_btime_videos, output, prefix="live_" + ) + else: + log.info("Failed to find a live site variant") + + return { + "PlaybackSimilarity": live_sim, + "PlaybackSimilarity2D": live_sim2d, + "Similarity": old_sim, + "Similarity2D": old_sim2d, + }