
Commit 0054da5

yaxan and Ubuntu authored

centml.compile + performance prediction backend (#70)

* Implements centml.compile module for remote compilation or prediction
* New centml_prediction_backend for predicting inference time and exporting to Prometheus
* New class for profiling / stepping through a graph module
* Adds scripts for prediction data collection and a sample script for running the prediction workflow
* Adds sample prediction data for A10G and A100

Co-authored-by: Ubuntu <ubuntu@ip-172-31-46-81.us-east-2.compute.internal>
1 parent 80cc0b3 commit 0054da5

File tree

12 files changed: +775 additions, 0 deletions


centml/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+from .compile import compile
+
+__all__ = ["compile"]

centml/compile.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+import builtins
+from typing import Callable, Dict, Optional, Union
+
+import torch
+
+from centml.compiler.backend import centml_dynamo_backend
+from centml.compiler.config import OperationMode, settings
+from centml.compiler.prediction.backend import centml_prediction_backend, get_gauge
+
+
+def compile(
+    model: Optional[Callable] = None,
+    *,
+    fullgraph: builtins.bool = False,
+    dynamic: Optional[builtins.bool] = None,
+    mode: Union[str, None] = None,
+    options: Optional[Dict[str, Union[str, builtins.int, builtins.bool]]] = None,
+    disable: builtins.bool = False,
+) -> Callable:
+
+    if settings.CENTML_MODE == OperationMode.REMOTE_COMPILATION:
+        # Return the remote-compiled model
+        compiled_model = torch.compile(
+            model,
+            backend=centml_dynamo_backend,  # Compilation backend
+            fullgraph=fullgraph,
+            dynamic=dynamic,
+            mode=mode,
+            options=options,
+            disable=disable,
+        )
+        return compiled_model
+    elif settings.CENTML_MODE == OperationMode.PREDICTION:
+        # Proceed with prediction workflow
+        compiled_model = torch.compile(
+            model,
+            backend=centml_prediction_backend,  # Prediction backend
+            fullgraph=fullgraph,
+            dynamic=dynamic,
+            mode=mode,
+            options=options,
+            disable=disable,
+        )
+
+        def centml_wrapper(*args, **kwargs):
+            out = compiled_model(*args, **kwargs)
+            # Update the prometheus metrics with final values
+            gauge = get_gauge()
+            for gpu in settings.CENTML_PREDICTION_GPUS.split(','):
+                gauge.set_metric_value(gpu)
+
+            return out
+
+        return centml_wrapper
+    else:
+        raise Exception("Invalid operation mode")
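Note (not part of the diff): a minimal usage sketch of the new entry point, on a hypothetical toy model. centml.compile mirrors the torch.compile signature; which backend runs is decided by settings.CENTML_MODE, and the sketch assumes the selected mode's prerequisites (remote compilation service or prediction data file) are in place.

import torch
import centml

# Hypothetical toy model; any callable torch.compile accepts should work.
model = torch.nn.Linear(128, 64).eval()

# With CENTML_MODE=PREDICTION, the returned wrapper also flushes the
# per-GPU predicted times to Prometheus after every call.
compiled = centml.compile(model)
out = compiled(torch.randn(8, 128))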

centml/compiler/config.py

Lines changed: 10 additions & 0 deletions
@@ -9,6 +9,11 @@ class CompilationStatus(Enum):
     DONE = "DONE"
 
 
+class OperationMode(Enum):
+    PREDICTION = "PREDICTION"
+    REMOTE_COMPILATION = "REMOTE_COMPILATION"
+
+
 class Config(BaseSettings):
     CENTML_COMPILER_TIMEOUT: int = 10
     CENTML_COMPILER_MAX_RETRIES: int = 3
@@ -31,5 +36,10 @@ class Config(BaseSettings):
     # If the server response is smaller than this, don't gzip it
     CENTML_MINIMUM_GZIP_SIZE: int = 1000
 
+    CENTML_MODE: OperationMode = OperationMode.REMOTE_COMPILATION
+    CENTML_PREDICTION_DATA_FILE: str = 'tests/sample_data.csv'
+    CENTML_PREDICTION_GPUS: str = "A10G,A100SXM440GB"
+    CENTML_PROMETHEUS_PORT: int = 8000
+
 
 settings = Config()
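Note (not part of the diff): since Config extends pydantic's BaseSettings, these fields can presumably be overridden through same-named environment variables; a hedged sketch, assuming standard BaseSettings behavior:

import os

# Assumed: environment variables override field defaults. They must be
# set before the first import of centml.compiler.config, because
# `settings = Config()` runs at import time.
os.environ["CENTML_MODE"] = "PREDICTION"
os.environ["CENTML_PREDICTION_GPUS"] = "A10G"  # predict for a single GPU

from centml.compiler.config import OperationMode, settings
assert settings.CENTML_MODE == OperationMode.PREDICTION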
centml/compiler/prediction/backend.py

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+from typing import List
+
+import torch
+from torch._subclasses.fake_tensor import FakeTensorMode
+
+from centml.compiler.config import settings
+from centml.compiler.prediction.kdtree import get_tree_db
+from centml.compiler.prediction.metric import get_gauge
+from centml.compiler.prediction.profiler import Profiler
+
+
+def centml_prediction_backend(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
+    profilers = []
+    tree_db = get_tree_db()
+    for gpu in settings.CENTML_PREDICTION_GPUS.split(','):
+        profilers.append(Profiler(gm, gpu, tree_db))
+
+    def forward(*args):
+        fake_mode = FakeTensorMode(allow_non_fake_inputs=True)
+        fake_args = [fake_mode.from_tensor(arg) if isinstance(arg, torch.Tensor) else arg for arg in args]
+        with fake_mode:
+            for prof in profilers:
+                out, t = prof.propagate(*fake_args)
+                gauge = get_gauge()
+                gauge.increment(prof.gpu, t)
+        return out
+
+    return forward
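Note (not part of the diff): a hedged sketch of handing this backend straight to torch.compile, bypassing the centml.compile wrapper. Dynamo passes the captured FX graph to centml_prediction_backend, and the returned forward replays it under FakeTensorMode, so no real GPU kernels are launched.

import torch
from centml.compiler.prediction.backend import centml_prediction_backend

# Hypothetical toy model.
model = torch.nn.Sequential(torch.nn.Linear(32, 32), torch.nn.ReLU())
predicted = torch.compile(model, backend=centml_prediction_backend)

# Each call propagates fake tensors through the graph once per
# configured GPU and accumulates the predicted times in the gauge.
predicted(torch.randn(4, 32))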
centml/compiler/prediction/kdtree.py

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+import ast
+import csv
+import logging
+
+from sklearn.neighbors import KDTree  # type: ignore
+
+from centml.compiler.config import settings
+
+_tree_db = None
+
+
+class KDTreeWithValues:
+    def __init__(self, points=None, values=None):
+        self.points = points if points else []
+        self.values = values if values else []
+        if self.points:
+            self.tree = KDTree(self.points)
+        else:
+            self.tree = None
+
+    def add(self, point, value):
+        self.points.append(point)
+        self.values.append(value)
+        self.tree = KDTree(self.points)
+
+    def query(self, point):
+        if self.tree is None:
+            return None, None
+
+        dist, idx = self.tree.query([point], k=1)
+        return dist[0][0], self.values[idx[0][0]]
+
+
+class TreeDB:
+    def __init__(self, data_csv):
+        self.db = {}
+        self._populate_db(data_csv)
+
+    def get(self, key, inp):
+        if key not in self.db:
+            logging.getLogger(__name__).warning(f"Key {key} not found in database")
+            return float('-inf')
+        # TODO: Handle the case of unfound keys better. For now, return -inf to indicate something went wrong.
+        # Ideally, we shouldn't throw away a whole prediction because of one possibly insignificant node.
+
+        _, val = self.db[key].query(inp)
+        return val
+
+    def _add_from_db(self, key, points, values):
+        self.db[key] = KDTreeWithValues(points, values)
+
+    def _populate_db(self, data_csv):
+        with open(data_csv, newline='') as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                try:
+                    key = (row['op'], int(row['dim']), row['inp_dtypes'], row['out_dtypes'], row['gpu'])
+                    points = ast.literal_eval(row['points'])
+                    values = ast.literal_eval(row['values'])
+                    self._add_from_db(key, points, values)
+                except ValueError as e:
+                    logging.getLogger(__name__).exception(f"Error parsing row: {row}\n{e}")
+
+
+def get_tree_db():
+    global _tree_db
+    if _tree_db is None:
+        _tree_db = TreeDB(settings.CENTML_PREDICTION_DATA_FILE)
+    return _tree_db
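Note (not part of the diff): to make the on-disk format concrete, a hypothetical row in the shape _populate_db parses. The column names come from the reader above; the example values are invented, not taken from the shipped A10G/A100 sample data. 'points' and 'values' hold Python-literal lists read with ast.literal_eval, and each point has 'dim' entries (the flattened input shape the Profiler builds).

import csv

# Hypothetical data file matching the reader above; values are assumed
# here to be measured kernel times in microseconds.
row = {
    'op': 'addmm',
    'dim': '4',
    'inp_dtypes': 'torch.float32,torch.float32,torch.float32',
    'out_dtypes': 'torch.float32',
    'gpu': 'A10G',
    'points': '[[8, 128, 128, 64], [16, 128, 128, 64]]',
    'values': '[42.0, 55.5]',
}
with open('sample_data.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=list(row))
    writer.writeheader()
    writer.writerow(row)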
centml/compiler/prediction/metric.py

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+import time
+
+from prometheus_client import Gauge, start_http_server
+
+from centml.compiler.config import settings
+
+_gauge = None
+
+
+def get_gauge():
+    global _gauge
+    if _gauge is None:
+        _gauge = GaugeMetric()
+    return _gauge
+
+
+class GaugeMetric:
+    def __init__(self):
+        start_http_server(settings.CENTML_PROMETHEUS_PORT)
+        self._gauge = Gauge('execution_time_microseconds', 'Kernel execution times by GPU', ['gpu', 'timestamp'])
+        self._values = {}
+
+    def increment(self, gpu_name, value):
+        if gpu_name not in self._values:
+            self._values[gpu_name] = 0
+        self._values[gpu_name] += value
+
+    def set_metric_value(self, gpu_name):
+        self._gauge.labels(gpu=gpu_name, timestamp=time.time()).set(self._values[gpu_name])
+        self._values[gpu_name] = 0
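Note (not part of the diff): prometheus_client's start_http_server serves plain-text metrics at /metrics, so once the wrapped model has run, the exported gauge can be inspected like this (port 8000 is the configured default):

import urllib.request

metrics = urllib.request.urlopen('http://localhost:8000/metrics').read().decode()
for line in metrics.splitlines():
    # e.g. execution_time_microseconds{gpu="A10G",timestamp="..."} 1234.5
    if line.startswith('execution_time_microseconds'):
        print(line)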
centml/compiler/prediction/profiler.py

Lines changed: 153 additions & 0 deletions
@@ -0,0 +1,153 @@
+from typing import Dict
+
+import torch
+import torch.fx
+from torch.fx.node import Node
+
+
+class Profiler:
+    def __init__(self, mod, gpu, treeDB, data_collection_mode=False):
+        self.mod = mod
+        self.graph = mod.graph
+        self.modules = dict(self.mod.named_modules())
+        self.tree_db = treeDB
+        self.gpu = gpu
+        self.data_collection_mode = data_collection_mode
+
+    def propagate(self, *args):
+        args_iter = iter(args)
+        env: Dict[str, Node] = {}
+        total_time = 0
+
+        def load_arg(a):
+            return torch.fx.graph.map_arg(a, lambda n: env[n.name])
+
+        def fetch_attr(target: str):
+            target_atoms = target.split('.')
+            attr_itr = self.mod
+            for i, atom in enumerate(target_atoms):
+                if not hasattr(attr_itr, atom):
+                    raise RuntimeError(f"Node referenced nonexistent target {'.'.join(target_atoms[:i])}")
+                attr_itr = getattr(attr_itr, atom)
+            return attr_itr
+
+        def get_flattened_shapes(args):
+            flattened_shapes = []
+            dtypes = []
+
+            for arg in args:
+                if isinstance(arg, (tuple, list)):
+                    if len(arg) > 0 and isinstance(arg[0], (tuple, list, torch.Tensor)):
+                        nested_shapes, nested_dtypes = get_flattened_shapes(arg[0])
+                        shape = [len(arg)] + nested_shapes
+                        dtypes.extend(nested_dtypes.split(','))
+                    else:
+                        shape = [len(arg)]
+                elif isinstance(arg, torch.Tensor):
+                    shape = list(arg.shape)
+                    dtypes.append(str(arg.dtype))
+                elif isinstance(arg, bool):
+                    shape = [1 if arg is True else 0]
+                elif isinstance(arg, (int, float)):
+                    shape = [arg]
+                else:
+                    shape = [1]
+                flattened_shapes.extend(shape)
+
+            if len(flattened_shapes) < 2:
+                flattened_shapes.extend([1])
+
+            input_dtypes = ','.join(dtypes) if dtypes else 'N/A'
+
+            return flattened_shapes, input_dtypes
+
+        def get_output_dtypes(results):
+            def find_dtypes(results):
+                if isinstance(results, torch.Tensor):
+                    return [str(results.dtype)]
+                if isinstance(results, (list, tuple)):
+                    dtypes = []
+                    for item in results:
+                        dtypes.extend(find_dtypes(item))
+                    return dtypes
+                return []
+
+            types = find_dtypes(results)
+
+            if types:
+                return ','.join(types)
+            return 'N/A'
+
+        def get_time_or_profile(key, inp_shapes, operation, *args, **kwargs):
+            t = self.tree_db.get(key, inp_shapes)
+
+            if self.data_collection_mode and t is None:
+                with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as prof:
+                    result = operation(*args, **kwargs)
+                event_time_total = 0
+                for event in prof.key_averages():
+                    event_time_total += event.cuda_time_total
+                t = event_time_total
+                self.tree_db.add(key, inp_shapes, t)
+
+            return t
+
+        for node in self.graph.nodes:
+            result = None
+            if node.op == 'placeholder':
+                result = next(args_iter)
+            elif node.op == 'get_attr':
+                result = fetch_attr(node.target)
+            elif node.op == 'call_function':
+                args = load_arg(node.args)
+                kwargs = load_arg(node.kwargs)
+                result = node.target(*args, **kwargs)
+
+                inp_shapes, input_dtypes = get_flattened_shapes(args)
+                output_dtypes = get_output_dtypes(result)
+
+                key = (node.target.__name__, len(inp_shapes), input_dtypes, output_dtypes, self.gpu)
+
+                t = get_time_or_profile(key, inp_shapes, node.target, *args, **kwargs)
+
+                total_time += t
+            elif node.op == 'call_method':
+                self_obj, *args = load_arg(node.args)
+                kwargs = load_arg(node.kwargs)
+                result = getattr(self_obj, node.target)(*args, **kwargs)
+
+                inp_shapes, input_dtypes = get_flattened_shapes(args)
+                output_dtypes = get_output_dtypes(result)
+
+                key = (node.target, len(inp_shapes), input_dtypes, output_dtypes, self.gpu)
+
+                t = get_time_or_profile(key, inp_shapes, getattr(self_obj, node.target), *args, **kwargs)
+
+                total_time += t
+            elif node.op == 'call_module':
+                mod = self.modules[node.target]
+                args = load_arg(node.args)
+                kwargs = load_arg(node.kwargs)
+                result = mod(*args, **kwargs)
+
+                inp_shapes, input_dtypes = get_flattened_shapes(args)
+
+                param_shapes = [param.shape for name, param in mod.named_parameters()]
+                param_dtypes = [str(param.dtype) for name, param in mod.named_parameters()]
+                flattened_params = [dim for shape in param_shapes for dim in shape]
+
+                inp_shapes = inp_shapes + flattened_params
+                input_dtypes = input_dtypes + ',' + ','.join(param_dtypes)
+
+                output_dtypes = get_output_dtypes(result)
+
+                key = (mod._get_name(), len(inp_shapes), input_dtypes, output_dtypes, self.gpu)
+
+                t = get_time_or_profile(key, inp_shapes, mod, *args, **kwargs)
+
+                total_time += t
+            elif node.op == 'output':
+                args = load_arg(node.args)
+                return args[0], total_time
+
+            env[node.name] = result
requirements-dev.txt

Lines changed: 2 additions & 0 deletions
@@ -11,3 +11,5 @@ parameterized>=0.9.0
 mypy==1.5.1
 types-requests==2.31.0.2
 types-tabulate>=0.9.0
+prometheus-client>=0.20.0
+
requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -7,5 +7,7 @@ Requests==2.32.2
 tabulate>=0.9.0
 pyjwt>=2.8.0
 cryptography==42.0.8
+prometheus-client>=0.20.0
 scipy>=1.6.0
+scikit-learn>=1.5.1
 platform_api_client @ git+https://github.com/CentML/platform_api_python_client.git@main
