| 
4 | 4 |    "cell_type": "markdown",  | 
5 | 5 |    "metadata": {  | 
6 | 6 |     "application/vnd.databricks.v1+cell": {  | 
7 |  | -     "cellMetadata": {},  | 
 | 7 | +     "cellMetadata": {  | 
 | 8 | +      "byteLimit": 2048000,  | 
 | 9 | +      "rowLimit": 10000  | 
 | 10 | +     },  | 
8 | 11 |      "inputWidgets": {},  | 
9 | 12 |      "nuid": "7d1791bd-9552-4116-90b2-daa678bbffb8",  | 
10 | 13 |      "showTitle": false,  | 
 | 
21 | 24 |    "execution_count": 0,  | 
22 | 25 |    "metadata": {  | 
23 | 26 |     "application/vnd.databricks.v1+cell": {  | 
24 |  | -     "cellMetadata": {},  | 
 | 27 | +     "cellMetadata": {  | 
 | 28 | +      "byteLimit": 2048000,  | 
 | 29 | +      "rowLimit": 10000  | 
 | 30 | +     },  | 
25 | 31 |      "inputWidgets": {},  | 
26 | 32 |      "nuid": "2901fbc9-49db-4285-8b94-c460f34d4c4f",  | 
27 | 33 |      "showTitle": false,  | 
 | 
40 | 46 |    "cell_type": "markdown",  | 
41 | 47 |    "metadata": {  | 
42 | 48 |     "application/vnd.databricks.v1+cell": {  | 
43 |  | -     "cellMetadata": {},  | 
 | 49 | +     "cellMetadata": {  | 
 | 50 | +      "byteLimit": 2048000,  | 
 | 51 | +      "rowLimit": 10000  | 
 | 52 | +     },  | 
44 | 53 |      "inputWidgets": {},  | 
45 | 54 |      "nuid": "655120e7-6b25-417c-a08a-c6925feaa425",  | 
46 | 55 |      "showTitle": false,  | 
 | 
107 | 116 |    "execution_count": 0,  | 
108 | 117 |    "metadata": {  | 
109 | 118 |     "application/vnd.databricks.v1+cell": {  | 
110 |  | -     "cellMetadata": {},  | 
 | 119 | +     "cellMetadata": {  | 
 | 120 | +      "byteLimit": 2048000,  | 
 | 121 | +      "rowLimit": 10000  | 
 | 122 | +     },  | 
111 | 123 |      "inputWidgets": {},  | 
112 | 124 |      "nuid": "a2368434-0191-416c-aa1d-12cd44cf48e6",  | 
113 | 125 |      "showTitle": false,  | 
114 | 126 |      "tableResultSettingsMap": {},  | 
115 | 127 |      "title": ""  | 
116 | 128 |     }  | 
117 | 129 |    },  | 
118 |  | -   "outputs": [],  | 
 | 130 | +   "outputs": [  | 
 | 131 | +    {  | 
 | 133 | +     "name": "stdout",  | 
 | 134 | +     "output_type": "stream",  | 
 | 135 | +     "text": [  | 
 | 136 | +      "+-------+--------+--------------------+----------+\n|user_id|tweet_id|               tweet|tweet_date|\n+-------+--------+--------------------+----------+\n|    135|      13|Enjoying a great ...|2024-02-01|\n|    136|      14|Another #HappyDay...|2024-02-03|\n|    137|      15|Productivity peak...|2024-02-04|\n|    138|      16|Exploring new tec...|2024-02-04|\n|    139|      17|Gratitude for tod...|2024-02-05|\n|    140|      18|Innovation drives...|2024-02-07|\n|    141|      19|Connecting with n...|2024-02-09|\n+-------+--------+--------------------+----------+\n\n"  | 
 | 137 | +     ]  | 
 | 138 | +    }  | 
 | 139 | +   ],  | 
119 | 140 |    "source": [  | 
120 | 141 |     "tweets_data_3103 = [\n",  | 
121 | 142 |     "    (135, 13, \"Enjoying a great start to the day. #HappyDay #MorningVibes\", \"2024-02-01\"),\n",  | 
 | 
131 | 152 |     "tweets_df_3103 = spark.createDataFrame(tweets_data_3103, tweets_columns_3103)\n",  | 
132 | 153 |     "tweets_df_3103.show()"  | 
133 | 154 |    ]  | 
 | 155 | +  },  | 
 | 156 | +  {  | 
 | 157 | +   "cell_type": "code",  | 
 | 158 | +   "execution_count": 0,  | 
 | 159 | +   "metadata": {  | 
 | 160 | +    "application/vnd.databricks.v1+cell": {  | 
 | 161 | +     "cellMetadata": {  | 
 | 162 | +      "byteLimit": 2048000,  | 
 | 163 | +      "rowLimit": 10000  | 
 | 164 | +     },  | 
 | 165 | +     "inputWidgets": {},  | 
 | 166 | +     "nuid": "f0551d7b-8f01-4cab-86ca-a6eea077c1c9",  | 
 | 167 | +     "showTitle": false,  | 
 | 168 | +     "tableResultSettingsMap": {},  | 
 | 169 | +     "title": ""  | 
 | 170 | +    }  | 
 | 171 | +   },  | 
 | 172 | +   "outputs": [],  | 
 | 173 | +   "source": [  | 
 | 174 | +    "df_hashtags_3103 = tweets_df_3103\\\n",  | 
 | 175 | +    "                    .withColumn( \"hashtags\",\n",  | 
 | 176 | +    "                                expr(\"regexp_extract_all(tweet, '(#[A-Za-z0-9_]+)', 0)\")\n",  | 
 | 177 | +    "                                )"  | 
 | 178 | +   ]  | 
 | 179 | +  },  | 
 | 180 | +  {  | 
 | 181 | +   "cell_type": "code",  | 
 | 182 | +   "execution_count": 0,  | 
 | 183 | +   "metadata": {  | 
 | 184 | +    "application/vnd.databricks.v1+cell": {  | 
 | 185 | +     "cellMetadata": {  | 
 | 186 | +      "byteLimit": 2048000,  | 
 | 187 | +      "rowLimit": 10000  | 
 | 188 | +     },  | 
 | 189 | +     "inputWidgets": {},  | 
 | 190 | +     "nuid": "eac4937b-1147-49ea-b0db-bd25680c661b",  | 
 | 191 | +     "showTitle": false,  | 
 | 192 | +     "tableResultSettingsMap": {},  | 
 | 193 | +     "title": ""  | 
 | 194 | +    }  | 
 | 195 | +   },  | 
 | 196 | +   "outputs": [],  | 
 | 197 | +   "source": [  | 
 | 198 | +    "df_exploded_3103 = df_hashtags_3103\\\n",  | 
 | 199 | +    "                    .withColumn(\"hashtag\", explode(\"hashtags\"))"  | 
 | 200 | +   ]  | 
 | 201 | +  },  | 
 | 202 | +  {  | 
 | 203 | +   "cell_type": "code",  | 
 | 204 | +   "execution_count": 0,  | 
 | 205 | +   "metadata": {  | 
 | 206 | +    "application/vnd.databricks.v1+cell": {  | 
 | 207 | +     "cellMetadata": {  | 
 | 208 | +      "byteLimit": 2048000,  | 
 | 209 | +      "rowLimit": 10000  | 
 | 210 | +     },  | 
 | 211 | +     "inputWidgets": {},  | 
 | 212 | +     "nuid": "930e4528-ab1e-4ce5-af1b-ff829423a1d3",  | 
 | 213 | +     "showTitle": false,  | 
 | 214 | +     "tableResultSettingsMap": {},  | 
 | 215 | +     "title": ""  | 
 | 216 | +    }  | 
 | 217 | +   },  | 
 | 218 | +   "outputs": [  | 
 | 219 | +    {  | 
 | 221 | +     "data": {  | 
 | 222 | +      "text/html": [  | 
 | 223 | +       "<style scoped>\n",  | 
 | 224 | +       "  .table-result-container {\n",  | 
 | 225 | +       "    max-height: 300px;\n",  | 
 | 226 | +       "    overflow: auto;\n",  | 
 | 227 | +       "  }\n",  | 
 | 228 | +       "  table, th, td {\n",  | 
 | 229 | +       "    border: 1px solid black;\n",  | 
 | 230 | +       "    border-collapse: collapse;\n",  | 
 | 231 | +       "  }\n",  | 
 | 232 | +       "  th, td {\n",  | 
 | 233 | +       "    padding: 5px;\n",  | 
 | 234 | +       "  }\n",  | 
 | 235 | +       "  th {\n",  | 
 | 236 | +       "    text-align: left;\n",  | 
 | 237 | +       "  }\n",  | 
 | 238 | +       "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>hashtag</th><th>count</th></tr></thead><tbody><tr><td>#HappyDay</td><td>3</td></tr><tr><td>#TechLife</td><td>2</td></tr><tr><td>#WorkLife</td><td>1</td></tr></tbody></table></div>"  | 
 | 239 | +      ]  | 
 | 240 | +     },  | 
 | 241 | +     "metadata": {  | 
 | 242 | +      "application/vnd.databricks.v1+output": {  | 
 | 243 | +       "addedWidgets": {},  | 
 | 244 | +       "aggData": [],  | 
 | 245 | +       "aggError": "",  | 
 | 246 | +       "aggOverflow": false,  | 
 | 247 | +       "aggSchema": [],  | 
 | 248 | +       "aggSeriesLimitReached": false,  | 
 | 249 | +       "aggType": "",  | 
 | 250 | +       "arguments": {},  | 
 | 251 | +       "columnCustomDisplayInfos": {},  | 
 | 252 | +       "data": [  | 
 | 253 | +        [  | 
 | 254 | +         "#HappyDay",  | 
 | 255 | +         3  | 
 | 256 | +        ],  | 
 | 257 | +        [  | 
 | 258 | +         "#TechLife",  | 
 | 259 | +         2  | 
 | 260 | +        ],  | 
 | 261 | +        [  | 
 | 262 | +         "#WorkLife",  | 
 | 263 | +         1  | 
 | 264 | +        ]  | 
 | 265 | +       ],  | 
 | 266 | +       "datasetInfos": [],  | 
 | 267 | +       "dbfsResultPath": null,  | 
 | 268 | +       "isJsonSchema": true,  | 
 | 269 | +       "metadata": {},  | 
 | 270 | +       "overflow": false,  | 
 | 271 | +       "plotOptions": {  | 
 | 272 | +        "customPlotOptions": {},  | 
 | 273 | +        "displayType": "table",  | 
 | 274 | +        "pivotAggregation": null,  | 
 | 275 | +        "pivotColumns": null,  | 
 | 276 | +        "xColumns": null,  | 
 | 277 | +        "yColumns": null  | 
 | 278 | +       },  | 
 | 279 | +       "removedWidgets": [],  | 
 | 280 | +       "schema": [  | 
 | 281 | +        {  | 
 | 282 | +         "metadata": "{}",  | 
 | 283 | +         "name": "hashtag",  | 
 | 284 | +         "type": "\"string\""  | 
 | 285 | +        },  | 
 | 286 | +        {  | 
 | 287 | +         "metadata": "{}",  | 
 | 288 | +         "name": "count",  | 
 | 289 | +         "type": "\"long\""  | 
 | 290 | +        }  | 
 | 291 | +       ],  | 
 | 292 | +       "type": "table"  | 
 | 293 | +      }  | 
 | 294 | +     },  | 
 | 295 | +     "output_type": "display_data"  | 
 | 296 | +    }  | 
 | 297 | +   ],  | 
 | 298 | +   "source": [  | 
 | 299 | +    "df_exploded_3103\\\n",  | 
 | 300 | +    "    .groupBy(\"hashtag\").agg(count(\"*\").alias(\"count\"))\\\n",  | 
 | 301 | +    "        .orderBy(desc(\"count\"), desc(\"hashtag\")).limit(3).display()"  | 
 | 302 | +   ]  | 
134 | 303 |   }  | 
135 | 304 |  ],  | 
136 | 305 |  "metadata": {  | 
137 | 306 |   "application/vnd.databricks.v1+notebook": {  | 
138 |  | -   "computePreferences": null,  | 
 | 307 | +   "computePreferences": {  | 
 | 308 | +    "hardware": {  | 
 | 309 | +     "accelerator": null,  | 
 | 310 | +     "gpuPoolId": null,  | 
 | 311 | +     "memory": null  | 
 | 312 | +    }  | 
 | 313 | +   },  | 
139 | 314 |    "dashboards": [],  | 
140 | 315 |    "environmentMetadata": {  | 
141 | 316 |     "base_environment": "",  | 
142 |  | -    "environment_version": "1"  | 
 | 317 | +    "environment_version": "2"  | 
143 | 318 |    },  | 
144 | 319 |    "inputWidgetPreferences": null,  | 
145 | 320 |    "language": "python",  | 
 | 
0 commit comments