From 5c72704d64489aa7a16d68e04967615fb41f6ac4 Mon Sep 17 00:00:00 2001 From: Sultan Orazbayev Date: Mon, 2 May 2022 23:41:26 +0600 Subject: [PATCH 1/2] Add files via upload --- local/to_datetime_code.ipynb | 265 +++++++++++++++++++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100644 local/to_datetime_code.ipynb diff --git a/local/to_datetime_code.ipynb b/local/to_datetime_code.ipynb new file mode 100644 index 0000000..e081c6c --- /dev/null +++ b/local/to_datetime_code.ipynb @@ -0,0 +1,265 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "7532fbcf-132e-4135-98e2-3449fca86a5d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from io import StringIO\n", + "\n", + "from dask.dataframe import from_pandas\n", + "from pandas import read_csv\n", + "\n", + "data = StringIO(\n", + " \"\"\"timestamp_start,time_worked\n", + " 2021-01-01 9:25 AM,3 hours 12 minutes\n", + " 2021-02-03 4:25 PM,2 hours\n", + " 2021-03-05 1:25 PM,15 minutes\n", + " 2021-03-05 11:25 PM,55 minutes\n", + " \"\"\"\n", + ")\n", + "df = read_csv(data)\n", + "ddf = from_pandas(df, npartitions=2)\n", + "\n", + "print(ddf.dtypes)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90b26db6-f54e-4523-91d3-d6939ad98b06", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from dask.dataframe import to_datetime\n", + "\n", + "ddf[\"converted_timestamp_start\"] = to_datetime(ddf[\"timestamp_start\"])\n", + "\n", + "print(ddf.dtypes)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99ccca84-8bce-4581-9f96-c3c33e442459", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ddf[\"day_of_week\"] = ddf[\"converted_timestamp_start\"].dt.dayofweek\n", + "\n", + "print(ddf[[\"converted_timestamp_start\", \"day_of_week\"]].compute())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3347a460-f662-420a-904d-1d541f4dd231", + "metadata": { + "tags": [] + }, + 
"outputs": [], + "source": [ + "from datetime import timedelta\n", + "\n", + "from pandas import to_timedelta\n", + "\n", + "ddf[\"converted_time_worked\"] = (\n", + " ddf[\"time_worked\"].apply(lambda x: to_timedelta(x), meta=timedelta).compute()\n", + ")\n", + "\n", + "print(ddf[[\"converted_timestamp_start\", \"converted_time_worked\"]].compute())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6237fcd-a5d1-41cc-b35e-72a997e2630b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ddf[\"work_completed\"] = ddf[\"converted_timestamp_start\"] + ddf[\"converted_time_worked\"]\n", + "\n", + "print(\n", + " ddf[\n", + " [\"converted_timestamp_start\", \"converted_time_worked\", \"work_completed\"]\n", + " ].compute()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7045a384-88fd-42c0-ab3f-5a6eeea7fef2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from io import StringIO\n", + "\n", + "from pandas import read_csv\n", + "\n", + "data = StringIO(\n", + " \"\"\"timestamp_start,time_worked\n", + " 2021-01-01 9:25 AM,3 hours 12 minutes\n", + " 2021-02-03 4:25 PM,2 hours\n", + " missing ,15 minutes\n", + " 2021-03-05 11:?? 
PM,55 minutes\n", + " \"\"\"\n", + ")\n", + "\n", + "df = read_csv(data)\n", + "ddf = from_pandas(df, npartitions=2)\n", + "\n", + "print(ddf.dtypes)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5ac51e2-e600-401a-a9f3-70e03026d521", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from io import StringIO\n", + "\n", + "from dask.dataframe import from_pandas, to_datetime\n", + "from pandas import read_csv\n", + "\n", + "data = StringIO(\n", + " \"\"\"timestamp_start,time_worked\n", + "2021-01-01 9:25 AM,3 hours 12 minutes\n", + "\"Thursday, October 9, 2022 14:25\",2 hours\n", + "\"January 12, 2022 14:25\",15 minutes\n", + " \"\"\"\n", + ")\n", + "\n", + "df = read_csv(data)\n", + "ddf = from_pandas(df, npartitions=2)\n", + "\n", + "ddf[\"converted_timestamp_start\"] = to_datetime(ddf[\"timestamp_start\"])\n", + "print(ddf[[\"timestamp_start\", \"converted_timestamp_start\"]].compute())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbdc4984-521f-416e-8f52-65eece46f1bf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from io import StringIO\n", + "\n", + "from dask.dataframe import from_pandas, to_datetime\n", + "from pandas import read_csv\n", + "\n", + "data = StringIO(\n", + " \"\"\"timestamp_start,time_worked\n", + "year 2021: 01/01 9:25 AM,3 hours 12 minutes\n", + "year 2021: 01/03 3:25 PM,2 hours\n", + "year 2021: 01/05 11:25 AM,2 hours\n", + "\"\"\"\n", + ")\n", + "df = read_csv(data)\n", + "ddf = from_pandas(df, npartitions=2)\n", + "\n", + "ddf[\"converted_timestamp_start\"] = to_datetime(\n", + " ddf[\"timestamp_start\"], format=\"year %Y: %m/%d %I:%M %p\"\n", + ")\n", + "print(ddf[[\"timestamp_start\", \"converted_timestamp_start\"]].compute())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e3eabc2-2d42-419e-a4c9-770823ffe4a4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from io import StringIO\n", + "\n", + 
"from dask.dataframe import from_pandas, to_datetime\n", + "from pandas import read_csv\n", + "\n", + "data = StringIO(\n", + " \"\"\"timestamp_local,location\n", + "2021-01-01 09:01:12,Asia/Almaty\n", + "2021-01-01 09:01:12,Europe/London\n", + "2021-01-01 09:01:12,America/New_York\n", + "\"\"\"\n", + ")\n", + "df = read_csv(data)\n", + "ddf = from_pandas(df, npartitions=2)\n", + "\n", + "ddf[\"converted_date\"] = to_datetime(ddf[\"timestamp_local\"], utc=False)\n", + "print(ddf[[\"timestamp_local\", \"converted_date\"]].compute())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "345a2359-d108-4945-98e2-22f362f3420a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def convert_tz(datetime_object, local_timezone):\n", + " timezone_aware = datetime_object.tz_localize(local_timezone)\n", + " timezone_est = timezone_aware.tz_convert(\"America/New_York\")\n", + " return timezone_est" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b4c385f-d83d-4347-ab32-6f9ec42e60d1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ddf[\"converted_date_tz_aware\"] = ddf[[\"converted_date\", \"location\"]].apply(\n", + " lambda row: convert_tz(row[\"converted_date\"], row[\"location\"]), axis=1, meta=\"\"\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 3a67453488cbc1af8dac798174622e26dc604ed0 Mon Sep 17 00:00:00 2001 From: Sultan Orazbayev Date: Tue, 3 May 2022 17:50:28 +0600 Subject: [PATCH 2/2] Update to_datetime code --- local/to_datetime_code.ipynb | 173 +++++++++++++++++++++++++---------- 1 file 
changed, 124 insertions(+), 49 deletions(-) diff --git a/local/to_datetime_code.ipynb b/local/to_datetime_code.ipynb index e081c6c..d04b0f2 100644 --- a/local/to_datetime_code.ipynb +++ b/local/to_datetime_code.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7532fbcf-132e-4135-98e2-3449fca86a5d", + "id": "ec38f3d5-6fb5-4975-818a-1d5a7243a136", "metadata": { "tags": [] }, @@ -25,66 +25,75 @@ "df = read_csv(data)\n", "ddf = from_pandas(df, npartitions=2)\n", "\n", - "print(ddf.dtypes)" + "print(ddf.dtypes)\n", + "# timestamp_start object\n", + "# time_worked object\n", + "# dtype: object" ] }, { "cell_type": "code", "execution_count": null, - "id": "90b26db6-f54e-4523-91d3-d6939ad98b06", - "metadata": { - "tags": [] - }, + "id": "57882342-cc3b-47c8-92c4-b1dcc960772e", + "metadata": {}, "outputs": [], "source": [ "from dask.dataframe import to_datetime\n", "\n", "ddf[\"converted_timestamp_start\"] = to_datetime(ddf[\"timestamp_start\"])\n", "\n", - "print(ddf.dtypes)" + "print(ddf.dtypes)\n", + "# timestamp_start object\n", + "# time_worked object\n", + "# converted_timestamp_start datetime64[ns]\n", + "# dtype: object" ] }, { "cell_type": "code", "execution_count": null, - "id": "99ccca84-8bce-4581-9f96-c3c33e442459", - "metadata": { - "tags": [] - }, + "id": "40457ca3-d467-46a0-b617-68b0d9587e29", + "metadata": {}, "outputs": [], "source": [ "ddf[\"day_of_week\"] = ddf[\"converted_timestamp_start\"].dt.dayofweek\n", "\n", - "print(ddf[[\"converted_timestamp_start\", \"day_of_week\"]].compute())" + "print(ddf[[\"converted_timestamp_start\", \"day_of_week\"]].compute())\n", + "# converted_timestamp_start day_of_week\n", + "# 0 2021-01-01 09:25:00 4\n", + "# 1 2021-02-03 16:25:00 2\n", + "# 2 2021-03-05 13:25:00 4\n", + "# 3 2021-03-05 23:25:00 4" ] }, { "cell_type": "code", "execution_count": null, - "id": "3347a460-f662-420a-904d-1d541f4dd231", + "id": "80dd5017-32cb-4a45-90f4-3d520f84f0cd", "metadata": { "tags": [] }, "outputs": [], 
"source": [ - "from datetime import timedelta\n", - "\n", - "from pandas import to_timedelta\n", + "from pandas import to_timedelta\n", "\n", "ddf[\"converted_time_worked\"] = (\n", - " ddf[\"time_worked\"].apply(lambda x: to_timedelta(x), meta=timedelta).compute()\n", + " ddf[\"time_worked\"].apply(to_timedelta, meta=(\"time_worked\", \"timedelta64[ns]\"))\n", ")\n", "\n", - "print(ddf[[\"converted_timestamp_start\", \"converted_time_worked\"]].compute())" + "print(ddf[[\"converted_timestamp_start\", \"converted_time_worked\"]].compute())\n", + "# converted_timestamp_start converted_time_worked\n", + "# 0 2021-01-01 09:25:00 0 days 03:12:00\n", + "# 1 2021-02-03 16:25:00 0 days 02:00:00\n", + "# 2 2021-03-05 13:25:00 0 days 00:15:00\n", + "# 3 2021-03-05 23:25:00 0 days 00:55:00" ] }, { "cell_type": "code", "execution_count": null, - "id": "a6237fcd-a5d1-41cc-b35e-72a997e2630b", - "metadata": { - "tags": [] - }, + "id": "61d13f66-c822-4607-a603-d33b12ae4c73", + "metadata": {}, "outputs": [], "source": [ "ddf[\"work_completed\"] = ddf[\"converted_timestamp_start\"] + ddf[\"converted_time_worked\"]\n", @@ -93,13 +102,35 @@ " ddf[\n", " [\"converted_timestamp_start\", \"converted_time_worked\", \"work_completed\"]\n", " ].compute()\n", - ")" + ")\n", + "# converted_timestamp_start converted_time_worked work_completed\n", + "# 0 2021-01-01 09:25:00 0 days 03:12:00 2021-01-01 12:37:00\n", + "# 1 2021-02-03 16:25:00 0 days 02:00:00 2021-02-03 18:25:00\n", + "# 2 2021-03-05 13:25:00 0 days 00:15:00 2021-03-05 13:40:00\n", + "# 3 2021-03-05 23:25:00 0 days 00:55:00 2021-03-06 00:20:00" ] }, { "cell_type": "code", "execution_count": null, - "id": "7045a384-88fd-42c0-ab3f-5a6eeea7fef2", + "id": "7c1d19aa-f31e-45d9-8a59-84fe94fe5ecb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ddf[\"converted_timestamp_start\"].dt.floor(\"15 min\").compute()\n", + "# 0 2021-01-01 09:15:00\n", + "# 1 2021-02-03 16:15:00\n", + "# 2 2021-03-05 13:15:00\n", + "# 3
2021-03-05 23:15:00\n", + "# Name: converted_timestamp_start, dtype: datetime64[ns]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7532fbcf-132e-4135-98e2-3449fca86a5d", "metadata": { "tags": [] }, @@ -121,13 +152,48 @@ "df = read_csv(data)\n", "ddf = from_pandas(df, npartitions=2)\n", "\n", - "print(ddf.dtypes)" + "print(ddf.dtypes)\n", + "# timestamp_start object\n", + "# time_worked object\n", + "# dtype: object" ] }, { "cell_type": "code", "execution_count": null, - "id": "f5ac51e2-e600-401a-a9f3-70e03026d521", + "id": "540f36d1-b64c-4ba7-9051-ee5af9df3ea8", + "metadata": {}, + "outputs": [], + "source": [ + "print(to_datetime(ddf[\"timestamp_start\"], errors=\"coerce\").compute())\n", + "# 0 2021-01-01 09:25:00\n", + "# 1 2021-02-03 16:25:00\n", + "# 2 NaT\n", + "# 3 NaT\n", + "# dtype: datetime64[ns]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c55679c-0c4f-4c5e-8456-c6778ade481d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(to_datetime(ddf[\"timestamp_start\"], errors=\"ignore\").compute())\n", + "# 0 2021-01-01 09:25:00\n", + "# 1 2021-02-03 16:25:00\n", + "# 2 missing\n", + "# 3 2021-03-05 11:??
PM\n", + "# dtype: object" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0016b448-9635-4e36-a407-f9e1ed39cbea", "metadata": { "tags": [] }, @@ -150,13 +216,17 @@ "ddf = from_pandas(df, npartitions=2)\n", "\n", "ddf[\"converted_timestamp_start\"] = to_datetime(ddf[\"timestamp_start\"])\n", - "print(ddf[[\"timestamp_start\", \"converted_timestamp_start\"]].compute())" + "print(ddf[[\"timestamp_start\", \"converted_timestamp_start\"]].compute())\n", + "# timestamp_start converted_timestamp_start\n", + "# 0 2021-01-01 9:25 AM 2021-01-01 09:25:00\n", + "# 1 Thursday, October 9, 2022 14:25 2022-10-09 14:25:00\n", + "# 2 January 12, 2022 14:25 2022-01-12 14:25:00" ] }, { "cell_type": "code", "execution_count": null, - "id": "dbdc4984-521f-416e-8f52-65eece46f1bf", + "id": "29f40937-9bea-499a-a3cb-c17c14b7be71", "metadata": { "tags": [] }, @@ -180,16 +250,18 @@ "ddf[\"converted_timestamp_start\"] = to_datetime(\n", " ddf[\"timestamp_start\"], format=\"year %Y: %m/%d %I:%M %p\"\n", ")\n", - "print(ddf[[\"timestamp_start\", \"converted_timestamp_start\"]].compute())" + "print(ddf[[\"timestamp_start\", \"converted_timestamp_start\"]].compute())\n", + "# timestamp_start converted_timestamp_start\n", + "# 0 year 2021: 01/01 9:25 AM 2021-01-01 09:25:00\n", + "# 1 year 2021: 01/03 3:25 PM 2021-01-03 15:25:00\n", + "# 2 year 2021: 01/05 11:25 AM 2021-01-05 11:25:00" ] }, { "cell_type": "code", "execution_count": null, - "id": "9e3eabc2-2d42-419e-a4c9-770823ffe4a4", - "metadata": { - "tags": [] - }, + "id": "110ce98e-d8bf-4501-a767-064f6bee56a3", + "metadata": {}, "outputs": [], "source": [ "from io import StringIO\n", @@ -208,13 +280,17 @@ "ddf = from_pandas(df, npartitions=2)\n", "\n", "ddf[\"converted_date\"] = to_datetime(ddf[\"timestamp_local\"], utc=False)\n", - "print(ddf[[\"timestamp_local\", \"converted_date\"]].compute())" + "print(ddf[[\"timestamp_local\", \"converted_date\"]].compute())\n", + "# timestamp_local converted_date\n", + "# 0 
2021-01-01 09:01:12 2021-01-01 09:01:12\n", + "# 1 2021-01-01 09:01:12 2021-01-01 09:01:12\n", + "# 2 2021-01-01 09:01:12 2021-01-01 09:01:12" ] }, { "cell_type": "code", "execution_count": null, - "id": "345a2359-d108-4945-98e2-22f362f3420a", + "id": "e2234c76-cf8d-47ed-bf9e-05a3c299f120", "metadata": { "tags": [] }, @@ -223,21 +299,20 @@ "def convert_tz(datetime_object, local_timezone):\n", " timezone_aware = datetime_object.tz_localize(local_timezone)\n", " timezone_est = timezone_aware.tz_convert(\"America/New_York\")\n", - " return timezone_est" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4b4c385f-d83d-4347-ab32-6f9ec42e60d1", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ + " return timezone_est\n", + "\n", + "\n", "ddf[\"converted_date_tz_aware\"] = ddf[[\"converted_date\", \"location\"]].apply(\n", - " lambda row: convert_tz(row[\"converted_date\"], row[\"location\"]), axis=1, meta=\"\"\n", - ")" + " lambda row: convert_tz(row[\"converted_date\"], row[\"location\"]),\n", + " axis=1,  # row-wise: each row supplies its own source timezone\n", + " meta=(\"converted_date_tz_aware\", \"datetime64[ns, America/New_York]\"),\n", + ")\n", + "\n", + "print(ddf[[\"location\", \"converted_date_tz_aware\"]].compute())\n", + "# location converted_date_tz_aware\n", + "# 0 Asia/Almaty 2020-12-31 22:01:12-05:00\n", + "# 1 Europe/London 2021-01-01 04:01:12-05:00\n", + "# 2 America/New_York 2021-01-01 09:01:12-05:00" ] } ],