8
8
import botsim .modules .remediator .dashboard .plot as dashboard_plot
9
9
from streamlit_chat import message
10
10
11
+
12
def plot_dialog_performance_banner(overall_performance, F1_scores, selected_intent, mode):
    """Render the six-metric banner row for one dialog/intent.

    Lays out six Streamlit metric columns (with spacer columns between them)
    showing session count, F1 score, goal-completion rate, and the three
    error rates for ``selected_intent`` under the given ``mode``.

    :param overall_performance: nested dict keyed first by lower-cased mode,
        then by intent name (without the "_eval" suffix); each leaf holds
        "overall_performance", "success_rate", "intent_error_rate",
        "NER_error_rate" and "other_error_rate" entries
    :param F1_scores: dict mapping intent name to its F1 score
    :param selected_intent: intent name as selected in the UI (may carry
        an "_eval" suffix that is stripped for the performance lookup)
    :param mode: evaluation mode label; lower-cased to index overall_performance
    """
    (_, col_sessions, _, col_f1, _, col_success,
     _, col_intent_err, _, col_ner_err, _, col_other_err) = st.columns(
        (.8, 2.5, .4, 2.5, .4, 2.5, .4, 2.5, .4, 2.5, .4, 2.5))

    perf = overall_performance[mode.lower()][selected_intent.replace("_eval", "")]
    col_sessions.metric("#Sessions", str(sum(list(perf["overall_performance"].values()))), "")

    # F1 below 0.9 is highlighted as a problem ("inverse" delta colour);
    # otherwise it is labelled "Good".
    f1 = F1_scores[selected_intent]
    if f1 < 0.9:
        col_f1.metric("F1 score", str(f1), "", "inverse")
    else:
        col_f1.metric("F1 score", str(f1), "Good")

    # (column, label, key, flagged-as-bad?) — thresholds: completion rate is
    # bad when low (< 0.7), error rates are bad when high (> 0.5), and the
    # "other" error rate is never flagged.
    rate_specs = (
        (col_success, "Goal-completion Rate", "success_rate", perf["success_rate"] < 0.7),
        (col_intent_err, "Intent Error Rate", "intent_error_rate", perf["intent_error_rate"] > 0.5),
        (col_ner_err, "NER Error Rate", "NER_error_rate", perf["NER_error_rate"] > 0.5),
        (col_other_err, "Other Error Rate", "other_error_rate", False),
    )
    for column, label, key, flagged in rate_specs:
        if flagged:
            column.metric(label, str(perf[key]), "", "inverse")
        else:
            column.metric(label, str(perf[key]), "")
36
+
37
+
11
38
def render_summary_reports (database , mode , test , dataset_info , overall_performance ):
12
39
row1_spacer1 , row1_1 , row1_spacer2 = st .columns ((.2 , 7.1 , .2 ))
13
40
with row1_1 :
14
41
st .header ("Bot health reports 📊" )
15
- st .markdown ("The bot health reports consist of a summary report across all intents and "
16
- "per-intent reports to show both the task-completion and NLU performance." )
42
+ st .markdown ("The bot health reports comprises 1) a summary report of a simulation session "
43
+ "across all intents and 2) "
44
+ "intent/dialog-specific reports to show both the task-completion and NLU performance." )
17
45
row2_spacer1 , row2_1 , row2_spacer2 = st .columns ((.2 , 7.1 , .4 ))
18
46
with row2_1 :
19
47
st .subheader ("Performance summary for selected test (test_id={}):" .format (test ))
@@ -46,36 +74,15 @@ def render_summary_reports(database, mode, test, dataset_info, overall_performan
46
74
st .plotly_chart (dashboard_plot .plot_test_performance (intent_to_errors ), use_container_width = True )
47
75
48
76
49
- def render_dialog_report (mode , selected_intent , F1s , overall_performance , detailed_performance ):
77
+ def render_dialog_report (mode , selected_intent , F1_scores , overall_performance , detailed_performance ):
50
78
row1_spacer1 , row1_1 , row1_spacer2 = st .columns ((.3 , 7.1 , .4 ))
51
- if not F1s :
52
- F1s = {selected_intent : 1.0 }
79
+ if not F1_scores :
80
+ F1_scores = {selected_intent : 1.0 }
53
81
with row1_1 :
54
82
st .markdown ("---" )
55
83
st .subheader ("Performance report for selected dialog \" " + selected_intent + "\" " )
56
- row2_spacer1 , row2_1 , row2_spacer2 , row2_2 , row2_spacer3 , row2_3 , \
57
- row2_spacer4 , row2_4 , row2_spacer5 , row2_5 , row2_spacer6 , row2_6 = st .columns (
58
- (.8 , 2.5 , .4 , 2.5 , .4 , 2.5 , .4 , 2.5 , .4 , 2.5 , .4 , 2.5 ))
59
84
60
- intent_performance = overall_performance [mode .lower ()][selected_intent .replace ("_eval" , "" )]
61
- row2_1 .metric ("#Sessions" , str (sum (list (intent_performance ["overall_performance" ].values ()))), "" )
62
- if F1s [selected_intent ] < 0.9 :
63
- row2_2 .metric ("F1 score" , str (F1s [selected_intent ]), "" , "inverse" )
64
- else :
65
- row2_2 .metric ("F1 score" , str (F1s [selected_intent ]), "Good" )
66
- if intent_performance ["success_rate" ] < 0.7 :
67
- row2_3 .metric ("Goal-completion Rate" , str (intent_performance ["success_rate" ]), "" , "inverse" )
68
- else :
69
- row2_3 .metric ("Goal-completion Rate" , str (intent_performance ["success_rate" ]), "" )
70
- if intent_performance ["intent_error_rate" ] > 0.5 :
71
- row2_4 .metric ("Intent Error Rate" , str (intent_performance ["intent_error_rate" ]), "" , "inverse" )
72
- else :
73
- row2_4 .metric ("Intent Error Rate" , str (intent_performance ["intent_error_rate" ]), "" )
74
- if intent_performance ["NER_error_rate" ] > 0.5 :
75
- row2_5 .metric ("NER Error Rate" , str (intent_performance ["NER_error_rate" ]), "" , "inverse" )
76
- else :
77
- row2_5 .metric ("NER Error Rate" , str (intent_performance ["NER_error_rate" ]), "" )
78
- row2_6 .metric ("Other Error Rate" , str (intent_performance ["other_error_rate" ]), "" )
85
+ plot_dialog_performance_banner (overall_performance , F1_scores , selected_intent , mode )
79
86
80
87
st .plotly_chart (
81
88
dashboard_plot .plot_intent_performance (
@@ -84,10 +91,10 @@ def render_dialog_report(mode, selected_intent, F1s, overall_performance, detail
84
91
use_container_width = True )
85
92
86
93
87
- def render_remediation (mode , selected_intent , F1s , overall_performance , detailed_performance ):
94
+ def render_remediation (mode , selected_intent , F1_scores , overall_performance , detailed_performance ):
88
95
row1_spacer1 , row1_1 , row1_spacer2 = st .columns ((.2 , 7.1 , .2 ))
89
- if not F1s :
90
- F1s = {selected_intent : 1.0 }
96
+ if not F1_scores :
97
+ F1_scores = {selected_intent : 1.0 }
91
98
with row1_1 :
92
99
st .markdown ("---" )
93
100
st .header ("Remediation Suggestions for {} 🛠️" .format (selected_intent ))
@@ -96,28 +103,7 @@ def render_remediation(mode, selected_intent, F1s, overall_performance, detailed
96
103
"They can also be extended by BotSIM users to include domain expertise or bot-specific "
97
104
"information. " )
98
105
99
- row2_spacer1 , row2_1 , row2_spacer2 , row2_2 , row2_spacer3 , row2_3 , \
100
- row2_spacer4 , row2_4 , row2_spacer5 , row2_5 , row2_spacer6 , row2_6 = st .columns (
101
- (.8 , 2.5 , .4 , 2.5 , .4 , 2.5 , .4 , 2.5 , .4 , 2.5 , .4 , 2.5 ))
102
- intent_performance = overall_performance [mode .lower ()][selected_intent .replace ("_eval" , "" )]
103
- row2_1 .metric ("#Sessions" , str (sum (list (intent_performance ["overall_performance" ].values ()))), "" )
104
- if F1s [selected_intent ] < 0.9 :
105
- row2_2 .metric ("F1 score" , str (F1s [selected_intent ]), "" , "inverse" )
106
- else :
107
- row2_2 .metric ("F1 score" , str (F1s [selected_intent ]), "Good" )
108
- if intent_performance ["success_rate" ] < 0.7 :
109
- row2_3 .metric ("Goal-completion Rate" , str (intent_performance ["success_rate" ]), "" , "inverse" )
110
- else :
111
- row2_3 .metric ("Goal-completion Rate" , str (intent_performance ["success_rate" ]), "" )
112
- if intent_performance ["intent_error_rate" ] > 0.5 :
113
- row2_4 .metric ("Intent Error Rate" , str (intent_performance ["intent_error_rate" ]), "" , "inverse" )
114
- else :
115
- row2_4 .metric ("Intent Error Rate" , str (intent_performance ["intent_error_rate" ]), "" )
116
- if intent_performance ["NER_error_rate" ] > 0.5 :
117
- row2_5 .metric ("NER Error Rate" , str (intent_performance ["NER_error_rate" ]), "" , "inverse" )
118
- else :
119
- row2_5 .metric ("NER Error Rate" , str (intent_performance ["NER_error_rate" ]), "" )
120
- row2_6 .metric ("Other Error Rate" , str (intent_performance ["other_error_rate" ]), "" )
106
+ plot_dialog_performance_banner (overall_performance , F1_scores , selected_intent , mode )
121
107
122
108
row3_spacer1 , row3_1 , row3_spacer2 = st .columns ((.2 , 7.1 , .2 ))
123
109
with row3_1 :
@@ -147,13 +133,13 @@ def render_remediation(mode, selected_intent, F1s, overall_performance, detailed
147
133
if len (droplist_labels ) > 0 :
148
134
row4_spacer1 , row4_1 , row4_spacer2 , row4_2 , row4_spacer3 = st .columns ((.4 , 8.3 , .4 , .4 , .2 ))
149
135
with row4_1 :
150
- st .markdown ("For intent classification models, we show the wrongly predicted paraphrases intent queries "
136
+ st .markdown ("For intent models, we show the wrongly predicted paraphrases intent queries "
151
137
"grouped by their corresponding original"
152
138
" training utterances (**sorted in descending order by number of errors**). "
153
139
"Detailed analysis can be found on the right hand side expander." )
154
140
row5_spacer1 , row5_1 , row5_spacer2 , row5_2 , row5_spacer3 = st .columns ((.4 , 4.3 , .4 , 4.3 , .2 ))
155
141
with row5_1 :
156
- utt_selected = st .selectbox ("Which utterance do you want to analyze ? "
142
+ utt_selected = st .selectbox ("Which utterance do you want to investigate ? "
157
143
"(" + str (len (droplist_labels )) + " in total)" ,
158
144
list (droplist_labels ), key = "utt" )
159
145
with row5_2 :
@@ -213,15 +199,16 @@ def render_remediation(mode, selected_intent, F1s, overall_performance, detailed
213
199
st .json (ner_errors )
214
200
215
201
216
- def render_analytics (database , test , cm_plot , recalls , precisions , F1s , intent_to_clusters , intent_to_supports ,
202
+ def render_analytics (database , test , cm_plot , recalls , precisions , F1_scores , intent_to_clusters , intent_to_supports ,
217
203
all_intents ):
218
204
row1_spacer1 , row1_1 , row1_spacer2 = st .columns ((.2 , 7.1 , .2 ))
219
205
with row1_1 :
220
206
st .markdown ("---" )
221
207
st .header ("Conversation Analytics ⚙️" )
222
- st .markdown ("BotSIM also offers analytical tools for helping users gain more insights into their systems. "
208
+ st .markdown ("Analytical tools for helping users gain insights into their bots for "
209
+ "troubleshooting and improvement. "
223
210
"These tools include confusion matrix analysis, intent utterance tSNE clustering and "
224
- "bootstrap-based confidence analysis " )
211
+ "many more can be added in the layout. " )
225
212
226
213
row2_spacer1 , row2_1 , row2_spacer2 = st .columns ((.4 , 7.1 , .4 ))
227
214
with row2_1 :
@@ -244,24 +231,24 @@ def render_analytics(database, test, cm_plot, recalls, precisions, F1s, intent_t
244
231
245
232
sorted_recall = dict (sorted (recalls .items (), key = lambda item : - item [1 ]))
246
233
sorted_precision = dict (sorted (precisions .items (), key = lambda item : - item [1 ]))
247
- sorted_F1 = dict (sorted (F1s .items (), key = lambda item : - item [1 ]))
234
+ sorted_F1 = dict (sorted (F1_scores .items (), key = lambda item : - item [1 ]))
248
235
table = []
249
236
250
237
if sorted_by == "Sorted by Recall" :
251
238
for intent in sorted_recall :
252
- precision , recall , F1 = sorted_precision [intent ], recalls [intent ], F1s [intent ]
239
+ precision , recall , F1_score = sorted_precision [intent ], recalls [intent ], F1_scores [intent ]
253
240
table .append (
254
- [intent , precision , recall , F1 , intent_to_supports [intent ], intent_to_clusters [intent ]])
241
+ [intent , precision , recall , F1_score , intent_to_supports [intent ], intent_to_clusters [intent ]])
255
242
elif sorted_by == "Sorted by Precision" :
256
243
for intent in sorted_precision :
257
- precision , recall , F1 = sorted_precision [intent ], recalls [intent ], F1s [intent ]
244
+ precision , recall , F1_score = sorted_precision [intent ], recalls [intent ], F1_scores [intent ]
258
245
table .append (
259
- [intent , precision , recall , F1 , intent_to_supports [intent ], intent_to_clusters [intent ]])
246
+ [intent , precision , recall , F1_score , intent_to_supports [intent ], intent_to_clusters [intent ]])
260
247
else :
261
248
for intent in sorted_F1 :
262
- precision , recall , F1 = sorted_precision [intent ], recalls [intent ], F1s [intent ]
249
+ precision , recall , F1_score = sorted_precision [intent ], recalls [intent ], F1_scores [intent ]
263
250
table .append (
264
- [intent , precision , recall , F1 , intent_to_supports [intent ], intent_to_clusters [intent ]])
251
+ [intent , precision , recall , F1_score , intent_to_supports [intent ], intent_to_clusters [intent ]])
265
252
266
253
row4_spacer1 , row4_1 , row4_2 , row4_3 , row4_4 , row4_5 , row4_6 , row4_spacer2 = st .columns (
267
254
(2.3 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 0.5 ))
@@ -290,10 +277,10 @@ def render_analytics(database, test, cm_plot, recalls, precisions, F1s, intent_t
290
277
with row5_1 :
291
278
st .markdown ("---" )
292
279
st .subheader ("tSNE visualisation of intent training utterances" )
293
- st .markdown ("To gauge the quality of the intent training utterances and identify intent overlaps , "
294
- "tSNE clustering is performed based on the sentence transformer embeddings of the intent training "
280
+ st .markdown ("To gauge the intent training data quality , "
281
+ "tSNE clustering is performed on the sentence transformer embeddings of the intent training "
295
282
"utterances. "
296
- "By examining the clusters, not only can users find intents with significant overlap in training "
283
+ "Not only can the clustering identify intents with significant overlap in training "
297
284
"data semantic space, "
298
- "they can also potentially discover novel intents from production logs to aid dialog re-design." )
285
+ "it can also potentially discover novel intents from production logs to aid dialog re-design." )
299
286
st .plotly_chart (dashboard_plot .plot_tSNE (all_intents , database , test ), use_container_width = True )
0 commit comments