-
Notifications
You must be signed in to change notification settings - Fork 49
Use tools-API in qualx predict #1838
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -13,12 +13,15 @@ | |||||||||||||||
| # limitations under the License. | ||||||||||||||||
|
|
||||||||||||||||
| """module that defines the app descriptor for the results loaded by the tools.""" | ||||||||||||||||
|
|
||||||||||||||||
| import re | ||||||||||||||||
| from dataclasses import dataclass, field | ||||||||||||||||
| from functools import cached_property | ||||||||||||||||
| from typing import Optional | ||||||||||||||||
| from typing import Optional, List, Dict | ||||||||||||||||
|
|
||||||||||||||||
| import pandas as pd | ||||||||||||||||
| from pydantic.alias_generators import to_camel | ||||||||||||||||
|
|
||||||||||||||||
| from spark_rapids_tools.utils import Utilities | ||||||||||||||||
|
|
||||||||||||||||
|
|
||||||||||||||||
| @dataclass | ||||||||||||||||
|
|
@@ -32,6 +35,56 @@ class AppHandler(object): | |||||||||||||||
| # this will be loaded from the core-status csv report | ||||||||||||||||
| eventlog_path: Optional[str] = None | ||||||||||||||||
|
|
||||||||||||||||
@staticmethod
def get_pd_dtypes() -> Dict[str, str]:
    """
    Build the pandas dtype mapping for the AppHandler attributes.
    Each attribute is declared with its Scala type name, which is converted through
    Utilities.scala_to_pandas_type.
    :return: Dictionary keyed by attribute name whose values are the converted pandas types.
    """
    # (attribute name, Scala type name) pairs, kept in the order they are converted
    scala_types = (
        ('app_id', 'String'),
        ('attempt_id', 'Int'),
        ('app_name', 'String'),
        ('eventlog_path', 'String'),
    )
    return {attr: Utilities.scala_to_pandas_type(scala_t) for attr, scala_t in scala_types}
|
|
||||||||||||||||
@staticmethod
def normalize_attribute(arg_value: str) -> str:
    """
    Normalize an attribute name to the field name used by the AppHandler.
    The input is stripped and lower-cased, then every run of whitespace, '-' or '_'
    characters is removed in a single re.sub pass. The resulting plain token is looked up
    in the table of known attributes, so 'appName', 'app-name', 'app name' and 'APP_NAME'
    all resolve to 'app_name'.
    :param arg_value: the attribute name to normalize.
    :return: the actual field name that is used in the AppHandler, or arg_value unchanged
             when it does not match any known attribute.
    """
    # Treat any whitespace (not only plain spaces) as a separator so that interior tabs
    # are handled consistently with the leading/trailing whitespace removed by strip().
    plain_token = re.sub(r'[\s_\-]+', '', arg_value.strip().lower())
    known_fields = {
        'appname': 'app_name',
        'appid': 'app_id',
        'attemptid': 'attempt_id',
        'eventlogpath': 'eventlog_path',
    }
    # fall back to the caller's original value (not the normalized token) when unknown
    return known_fields.get(plain_token, arg_value)
|
|
||||||||||||||||
@classmethod
def get_key_attributes(cls) -> List[str]:
    """
    List the attribute names that identify an AppHandler.
    :return: A list holding the key attribute names; currently only 'app_id'.
    """
    key_attributes: List[str] = ['app_id']
    return key_attributes
|
|
||||||||||||||||
@classmethod
def get_default_key_columns(cls) -> Dict[str, str]:
    """
    Map every key attribute of the AppHandler to its default column name.
    The column name is obtained by applying to_camel to the attribute name.
    :return: Dictionary mapping attribute names to column names.
    """
    return {key_attr: to_camel(key_attr) for key_attr in cls.get_key_attributes()}
|
|
||||||||||||||||
| def is_name_defined(self) -> bool: | ||||||||||||||||
| """ | ||||||||||||||||
| Check if the app name is defined. | ||||||||||||||||
|
|
@@ -57,17 +110,37 @@ def uuid(self) -> str: | |||||||||||||||
| """ | ||||||||||||||||
| return self._app_id | ||||||||||||||||
|
|
||||||||||||||||
| def patch_into_df(self, df: pd.DataFrame) -> pd.DataFrame: | ||||||||||||||||
| def patch_into_df(self, | ||||||||||||||||
| df: pd.DataFrame, | ||||||||||||||||
| col_names: Optional[List[str]] = None) -> pd.DataFrame: | ||||||||||||||||
| """ | ||||||||||||||||
| Given a dataframe, this method will stitch the app_id and app-name to the dataframe. | ||||||||||||||||
| This can be useful in automatically adding the app-id/app-name to the data-frame | ||||||||||||||||
| :param df: the dataframe that we want to modify. | ||||||||||||||||
| :param col_names: optional list of column names that defines the app_id and app_name to the | ||||||||||||||||
| dataframe. It is assumed that the list comes in the order it is inserted in | ||||||||||||||||
| the column names. | ||||||||||||||||
| :return: the resulting dataframe from adding the columns. | ||||||||||||||||
| """ | ||||||||||||||||
| # TODO: We should consider add UUID as well, and use that for the joins instead. | ||||||||||||||||
| # append attempt_id to support multiple attempts | ||||||||||||||||
| col_values = [self.app_id] | ||||||||||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This feels weird that
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good point!
|
||||||||||||||||
| if col_names is None: | ||||||||||||||||
| # append attemptId to support multi-attempts | ||||||||||||||||
| col_names = ['appId'] | ||||||||||||||||
|
||||||||||||||||
| col_names = ['appId'] | |
| col_names = ['appId'] | |
| # Ensure col_values matches col_names in length | |
| if len(col_values) == 1 and len(col_names) > 1: | |
| col_values = col_values * len(col_names) | |
| elif len(col_values) != len(col_names): | |
| raise ValueError("Length of col_values must be 1 or match length of col_names") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We might want to eventually push the branching down into the core qualx APIs, just so all invocations to
predict() can use the switch, but this is fine (and easier) for now.